/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  gimple *stmt;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    /* After transform some stmts are removed and thus their vinfo is gone.  */
    if (vinfo_for_stmt (stmt))
      {
	gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
	STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
      }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.
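   For example, for the hypothetical group of two isomorphic stmts

     x0 = a0 + b0;
     x1 = a1 + b1;

   two of these structures are created: one collecting the def stmts
   of a0 and a1, and one collecting the def stmts of b0 and b1.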
   */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
	return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
	result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMT when possible.  Non-zero *SWAP indicates swap
   is required for cond_expr stmts.  Specifically, *SWAP is 1 if STMT is cond
   and operands of comparison need to be swapped; *SWAP is 2 if STMT is cond
   and code of comparison needs to be inverted.  If there is any operand swap
   in this function, *SWAP is set to a non-zero value.
   If there was a fatal error, return -1; if the error could be corrected by
   swapping operands of this node's parent, return 1; if everything is ok,
   return 0.
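   For example (hypothetical), if the first stmt is x0 = a0 * b0 with
   a0 defined inside the vectorized region and b0 defined outside of
   it, then for a second stmt x1 = b1 * a1 the def of b1 does not
   match the internal def collected for operand zero; since MULT_EXPR
   is commutative, the operands of the second stmt are swapped and the
   match is retried successfully.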
   */

static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     gimple *stmt, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't analyze def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
	 from the pattern.  Check that all the stmts of the node are in the
	 pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
	  && vect_stmt_in_region_p (vinfo, def_stmt)
	  && vinfo_for_stmt (def_stmt)
	  && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
	{
	  pattern = true;
	  if (!first && !oprnd_info->first_pattern
	      /* Allow different pattern state for the defs of the
		 first stmt in reduction chains.  */
	      && (oprnd_info->first_dt != vect_reduction_def
		  || (!second && !oprnd_info->second_pattern)))
	    {
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      return 1;
	    }

	  def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
	  dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

	  if (dt == vect_unknown_def_type)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.\n");
	      return -1;
	    }

	  switch (gimple_code (def_stmt))
	    {
	    case GIMPLE_PHI:
	    case GIMPLE_ASSIGN:
	      break;

	    default:
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "unsupported defining stmt:\n");
	      return -1;
	    }
	}

      if (second)
	oprnd_info->second_pattern = pattern;

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  if (((oprnd_info->first_dt != dt
		&& !(oprnd_info->first_dt == vect_reduction_def
		     && dt == vect_internal_def)
		&& !((oprnd_info->first_dt == vect_external_def
		      || oprnd_info->first_dt == vect_constant_def)
		     && (dt == vect_external_def
			 || dt == vect_constant_def)))
	       || !types_compatible_p (oprnd_info->first_op_type,
				       TREE_TYPE (oprnd))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	case vect_reduction_def:
	  break;

	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
	 we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: cannot swap operands of "
			       "shared stmt ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  return -1;
	}

      if (first_op_cond)
	{
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
			   gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "swapped operands to match def types in ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }

  *swap = swapped;
  return 0;
}


/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return true
   if they are isomorphic, otherwise return false and indicate in
   *MATCHES which stmts are not isomorphic to the first one.  If
   MATCHES[0] is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to B1 <= A1 ? X1 : Y1, or inverted to A1 < B1 ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<gimple *> stmts, unsigned int group_size,
		       unsigned nops, unsigned int *max_nunits,
		       bool *matches, bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unsupported data-type ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 scalar_type);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      /* If populating the vector type requires unrolling then fail
	 before adjusting *max_nunits for basic-block vectorization.  */
      if (is_a <bb_vec_info> (vinfo)
	  && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unrolling required "
			   "in basic block SLP\n");
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
	*max_nunits = TYPE_VECTOR_SUBPARTS (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;
	  if (gimple_call_internal_p (call_stmt)
	      || gimple_call_tail_p (call_stmt)
	      || gimple_call_noreturn_p (call_stmt)
	      || !gimple_call_nothrow_p (call_stmt)
	      || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    call_stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    first_stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      gimple *first_stmt = stmts[0];
	      if (gimple_call_num_args (stmt) != nops
		  || !operand_equal_p (gimple_call_fn (first_stmt),
				       gimple_call_fn (stmt), 0)
		  || gimple_call_fntype (first_stmt)
		     != gimple_call_fntype (stmt))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.
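		     For example, a hypothetical node mixing a load from
		     the a[] chain with a load from the b[] chain would
		     have different GROUP_FIRST_ELEMENTs and would need
		     to combine elements of two different vector loads,
		     which is not supported here.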
		     */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
			  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
			}
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.
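     For example (hypothetical), for a V4SF group { a0 + b0, a1 - b1,
     a2 + b2, a3 - b3 } with first_stmt_code PLUS_EXPR and
     alt_stmt_code MINUS_EXPR, one vector PLUS and one vector MINUS
     are generated and the mask computed below is sel = { 0, 5, 2, 7 },
     i.e. even lanes from one vector result and odd lanes from the
     other.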
     */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
	= XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
	{
	  sel[i] = i;
	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
	    sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
	}
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      stmts[i], 0);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      first_stmt, 0);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

static hash_set <vec <gimple *>, bst_traits> *bst_fail;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       unsigned int *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<gimple *> stmts, unsigned int group_size,
		     unsigned int *max_nunits,
		     vec<slp_tree> *loads,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
					loads, matches, npermutes, tree_size,
					max_tree_size);
  /* When SLP build fails for stmts record this, otherwise SLP build
     can be exponential in time when we allow to construct parts from
     scalars, see PR81723.  */
  if (! res)
    {
      vec <gimple *> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}
/* Recursively build an SLP tree for the stmts in STMTS.  Fail (and
   return NULL) if the def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  In that case
   *MATCHES indicates which stmts failed to match the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       unsigned int *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0, this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return NULL;

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap,
			      stmts, group_size, nops,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmt, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def)
	continue;

      if (++this_tree_size > max_tree_size)
	{
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child);
	  vect_free_oprnd_info (oprnds_info);
	  return NULL;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					&this_loads, matches, npermutes,
					&this_tree_size,
					max_tree_size)) != NULL)
	{
	  /* If we have all children of child built up from scalars then just
	     throw that away and build it up this node from scalars.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !is_pattern_stmt_p
		    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_loads.truncate (old_nloads);
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  dump_printf_loc (MSG_NOTE, vect_location,
				   "Building parent vector operands from "
				   "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Building vector operands from scalars\n");
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt)
	  && commutative_tree_code (gimple_assign_rhs_code (stmt))
	  && ! two_operators
	  /* Do so only if the number of unsuccessful permutes was no
	     more than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* Verify if we can safely swap or if we committed to a specific
	     operand order already.  */
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j]
		&& (swap[j] != 0
		    || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmts[j]))))
	      {
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: cannot swap operands "
				     "of shared stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      stmts[j], 0);
		  }
		goto fail;
	      }

	  /* Swap mismatched definition stmts.
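	     For example (hypothetical), if the group is
	     { x0 = a0 + b0, x1 = b1 + a1 } and lane 1 failed to match,
	     the collected def stmts of operands 0 and 1 are exchanged
	     for lane 1 only and the build is retried below with a
	     scratch 'matches' array.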
	     */
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		dump_printf (MSG_NOTE, "%d ", j);
	      }
	  dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    &this_loads, tem, npermutes,
					    &this_tree_size,
					    max_tree_size)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  gimple_set_plf (stmt, GF_PLF_1, false);
		}
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  if (!matches[j])
		    {
		      /* Avoid swapping operands twice.  */
		      if (gimple_plf (stmt, GF_PLF_1))
			continue;
		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
					 gimple_assign_rhs2_ptr (stmt));
		      gimple_set_plf (stmt, GF_PLF_1, true);
		    }
		}
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  {
		    gimple *stmt = stmts[j];
		    gcc_assert (gimple_plf (stmt, GF_PLF_1) == ! matches[j]);
		  }

	      /* If we have all children of child built up from scalars then
		 just throw that away and build it up this node from scalars.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !is_pattern_stmt_p
			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_loads.truncate (old_nloads);
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
			vect_free_slp_tree (grandchild);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      dump_printf_loc (MSG_NOTE, vect_location,
				       "Building parent vector operands from "
				       "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, location_t loc, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
		   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.
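   For example (hypothetical), with PERMUTATION = { 1, 2, 0 } the loop
   below scatters the stmts: old stmt 0 moves to slot 1, old stmt 1 to
   slot 2 and old stmt 2 to slot 0, so { s0, s1, s2 } becomes
   { s2, s0, s1 }.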
   */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
	return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	if (lidx != node->load_permutation[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  unsigned int unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (unrolling_factor == 1
	  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
	      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.
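   For example (hypothetical), in a reduction whose loads all use the
   permutation { 2, 0, 1, 3 } on a group of size 4, every index
   appears exactly once, so vect_attempt_slp_rearrange_stmts above can
   reorder the scalar stmts to the load order and no runtime
   permutation remains.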
   */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
	 no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    continue;
	  bool subchain_p = true;
	  next_load = NULL;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
	    {
	      if (j != 0
		  && (next_load != load
		      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
	    }
	  if (subchain_p)
	    SLP_TREE_LOAD_PERMUTATION (node).release ();
	  else
	    {
	      stmt_vec_info group_info
		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
	      unsigned nunits
		= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (group_info));
	      unsigned k, maxk = 0;
	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
		if (k > maxk)
		  maxk = k;
	      /* In BB vectorization we may not actually use a loaded vector
		 accessing elements in excess of GROUP_SIZE.  */
	      if (maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "BB vectorization with gaps at the end of "
				   "a load is not supported\n");
		  return false;
		}

	      /* Verify the permutation can be generated.  */
	      vec<tree> tem;
	      unsigned n_perms;
	      if (!vect_transform_slp_perm_load (node, tem, NULL,
						 1, slp_instn, true, &n_perms))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported load permutation\n");
		  return false;
		}
	    }
	}
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  unsigned test_vf
    = least_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
			     LOOP_VINFO_VECT_FACTOR
			       (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
					  slp_instn, true, &n_perms))
      return false;

  return true;
}


/* Find the last store in SLP INSTANCE.  */

gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
	last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
	last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
			 stmt_vector_for_cost *prologue_cost_vec,
			 stmt_vector_for_cost *body_cost_vec,
			 unsigned ncopies_for_cost)
{
  unsigned i, j;
  slp_tree child;
  gimple *stmt;
  stmt_vec_info stmt_info;
  tree lhs;

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
			       body_cost_vec, ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      vect_memory_access_type memory_access_type
	= (STMT_VINFO_STRIDED_P (stmt_info)
	   ? VMAT_STRIDED_SLP
	   : VMAT_CONTIGUOUS);
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	vect_model_store_cost (stmt_info, ncopies_for_cost,
			       memory_access_type, vect_uninitialized_def,
			       node, prologue_cost_vec, body_cost_vec);
      else
	{
	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    {
	      /* If the load is permuted then the alignment is determined by
		 the first group element not by the first scalar stmt DR.  */
	      stmt = GROUP_FIRST_ELEMENT (stmt_info);
	      stmt_info = vinfo_for_stmt (stmt);
	      /* Record the cost for the permutation.
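		 For example (hypothetical), with GROUP_SIZE 8, NUNITS 4
		 and a permutation that uses only indices 0 and 1, the
		 adjustment loop below counts a single vector load
		 instead of the two the full group would need.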
		 */
	      unsigned n_perms;
	      vect_transform_slp_perm_load (node, vNULL, NULL,
					    ncopies_for_cost, instance, true,
					    &n_perms);
	      record_stmt_cost (body_cost_vec, n_perms, vec_perm,
				stmt_info, 0, vect_body);
	      unsigned nunits
		= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
	      /* And adjust the number of loads performed.  This handles
		 redundancies as well as loads that are later dead.  */
	      auto_sbitmap perm (GROUP_SIZE (stmt_info));
	      bitmap_clear (perm);
	      for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
		bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
	      ncopies_for_cost = 0;
	      bool load_seen = false;
	      for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
		{
		  if (i % nunits == 0)
		    {
		      if (load_seen)
			ncopies_for_cost++;
		      load_seen = false;
		    }
		  if (bitmap_bit_p (perm, i))
		    load_seen = true;
		}
	      if (load_seen)
		ncopies_for_cost++;
	      gcc_assert (ncopies_for_cost
			  <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
			      + nunits - 1) / nunits);
	      ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
	    }
	  /* Record the cost for the vector loads.  */
	  vect_model_load_cost (stmt_info, ncopies_for_cost,
				memory_access_type, node, prologue_cost_vec,
				body_cost_vec);
	  return;
	}
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
	{
	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			    stmt_info, 0, vect_body);
	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
			    stmt_info, 0, vect_body);
	}
    }

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree op = gimple_op (stmt, i);
      gimple *def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
	continue;
      if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt))
	{
	  /* Without looking at the actual initializer a vector of
	     constants can be implemented as load from the constant pool.
	     ??? We need to pass down stmt_info for a vector type
	     even if it points to the wrong stmt.  */
	  if (dt == vect_constant_def)
	    record_stmt_cost (prologue_cost_vec, 1, vector_load,
			      stmt_info, 0, vect_prologue);
	  else if (dt == vect_external_def)
	    record_stmt_cost (prologue_cost_vec, 1, vec_construct,
			      stmt_info, 0, vect_prologue);
	}
    }

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
}

/* Compute the cost for the SLP instance INSTANCE.
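   For example (hypothetical), for a store group of size 4 in a loop
   with vectorization factor 2 and NUNITS 4, GROUP_SIZE is scaled to 8
   and ncopies_for_cost = least_common_multiple (4, 8) / 4 = 2 vector
   stmts are costed per node.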
   */

static void
vect_analyze_slp_cost (slp_instance instance, void *data)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_slp_cost ===\n");

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Adjust the group_size by the vectorization factor which is always one
     for basic-block vectorization.  */
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
	{
	case DOT_PROD_EXPR:
	case SAD_EXPR:
	  nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
					   (TREE_TYPE (gimple_assign_rhs1 (stmt))));
	  break;
	default:;
	}
    }
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
			   &prologue_cost_vec, &body_cost_vec,
			   ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

/* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
   one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
   the first GROUP1_SIZE stmts, since stores are consecutive), the second
   containing the remainder.
   Return the first stmt in the second group.
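   For example (hypothetical), splitting a contiguous group of 7
   stores with GROUP1_SIZE 4 leaves a first group of size 4 and a
   second group of size 3; GROUP_GAP of the second group becomes the
   original gap plus 4, and GROUP_GAP of the first group is increased
   by 3 so that it also skips over the second group.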
*/ 1822 1823 static gimple * 1824 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size) 1825 { 1826 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt); 1827 gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt); 1828 gcc_assert (group1_size > 0); 1829 int group2_size = GROUP_SIZE (first_vinfo) - group1_size; 1830 gcc_assert (group2_size > 0); 1831 GROUP_SIZE (first_vinfo) = group1_size; 1832 1833 gimple *stmt = first_stmt; 1834 for (unsigned i = group1_size; i > 1; i--) 1835 { 1836 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 1837 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1); 1838 } 1839 /* STMT is now the last element of the first group. */ 1840 gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 1841 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0; 1842 1843 GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size; 1844 for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt))) 1845 { 1846 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2; 1847 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1); 1848 } 1849 1850 /* For the second group, the GROUP_GAP is that before the original group, 1851 plus skipping over the first vector. */ 1852 GROUP_GAP (vinfo_for_stmt (group2)) = 1853 GROUP_GAP (first_vinfo) + group1_size; 1854 1855 /* GROUP_GAP of the first group now has to skip over the second group too. */ 1856 GROUP_GAP (first_vinfo) += group2_size; 1857 1858 if (dump_enabled_p ()) 1859 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n", 1860 group1_size, group2_size); 1861 1862 return group2; 1863 } 1864 1865 /* Analyze an SLP instance starting from a group of grouped stores. Call 1866 vect_build_slp_tree to build a tree of packed stmts if possible. 1867 Return FALSE if it's impossible to SLP any stmt in the loop. */ 1868 1869 static bool 1870 vect_analyze_slp_instance (vec_info *vinfo, 1871 gimple *stmt, unsigned max_tree_size) 1872 { 1873 slp_instance new_instance; 1874 slp_tree node; 1875 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt)); 1876 unsigned int unrolling_factor = 1, nunits; 1877 tree vectype, scalar_type = NULL_TREE; 1878 gimple *next; 1879 unsigned int i; 1880 unsigned int max_nunits = 0; 1881 vec<slp_tree> loads; 1882 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); 1883 vec<gimple *> scalar_stmts; 1884 1885 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 1886 { 1887 if (dr) 1888 { 1889 scalar_type = TREE_TYPE (DR_REF (dr)); 1890 vectype = get_vectype_for_scalar_type (scalar_type); 1891 } 1892 else 1893 { 1894 gcc_assert (is_a <loop_vec_info> (vinfo)); 1895 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 1896 } 1897 1898 group_size = GROUP_SIZE (vinfo_for_stmt (stmt)); 1899 } 1900 else 1901 { 1902 gcc_assert (is_a <loop_vec_info> (vinfo)); 1903 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 1904 group_size = as_a <loop_vec_info> (vinfo)->reductions.length (); 1905 } 1906 1907 if (!vectype) 1908 { 1909 if (dump_enabled_p ()) 1910 { 1911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1912 "Build SLP failed: unsupported data-type "); 1913 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type); 1914 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 1915 } 1916 1917 return false; 1918 } 1919 nunits = TYPE_VECTOR_SUBPARTS (vectype); 1920 1921 /* Create a node (a root of the SLP tree) for the packed grouped stores. 
*/ 1922 scalar_stmts.create (group_size); 1923 next = stmt; 1924 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 1925 { 1926 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ 1927 while (next) 1928 { 1929 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)) 1930 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next))) 1931 scalar_stmts.safe_push ( 1932 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next))); 1933 else 1934 scalar_stmts.safe_push (next); 1935 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 1936 } 1937 /* Mark the first element of the reduction chain as reduction to properly 1938 transform the node. In the reduction analysis phase only the last 1939 element of the chain is marked as reduction. */ 1940 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))) 1941 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def; 1942 } 1943 else 1944 { 1945 /* Collect reduction statements. */ 1946 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions; 1947 for (i = 0; reductions.iterate (i, &next); i++) 1948 scalar_stmts.safe_push (next); 1949 } 1950 1951 loads.create (group_size); 1952 1953 /* Build the tree for the SLP instance. */ 1954 bool *matches = XALLOCAVEC (bool, group_size); 1955 unsigned npermutes = 0; 1956 bst_fail = new hash_set <vec <gimple *>, bst_traits> (); 1957 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, 1958 &max_nunits, &loads, matches, &npermutes, 1959 NULL, max_tree_size); 1960 delete bst_fail; 1961 if (node != NULL) 1962 { 1963 /* Calculate the unrolling factor based on the smallest type. */ 1964 unrolling_factor 1965 = least_common_multiple (max_nunits, group_size) / group_size; 1966 1967 if (unrolling_factor != 1 1968 && is_a <bb_vec_info> (vinfo)) 1969 { 1970 1971 if (max_nunits > group_size) 1972 { 1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1974 "Build SLP failed: store group " 1975 "size not a multiple of the vector size " 1976 "in basic block SLP\n"); 1977 vect_free_slp_tree (node); 1978 loads.release (); 1979 return false; 1980 } 1981 /* Fatal mismatch. */ 1982 matches[group_size/max_nunits * max_nunits] = false; 1983 vect_free_slp_tree (node); 1984 loads.release (); 1985 } 1986 else 1987 { 1988 /* Create a new SLP instance. */ 1989 new_instance = XNEW (struct _slp_instance); 1990 SLP_INSTANCE_TREE (new_instance) = node; 1991 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; 1992 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; 1993 SLP_INSTANCE_LOADS (new_instance) = loads; 1994 1995 /* Compute the load permutation. */ 1996 slp_tree load_node; 1997 bool loads_permuted = false; 1998 FOR_EACH_VEC_ELT (loads, i, load_node) 1999 { 2000 vec<unsigned> load_permutation; 2001 int j; 2002 gimple *load, *first_stmt; 2003 bool this_load_permuted = false; 2004 load_permutation.create (group_size); 2005 first_stmt = GROUP_FIRST_ELEMENT 2006 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); 2007 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load) 2008 { 2009 int load_place = vect_get_place_in_interleaving_chain 2010 (load, first_stmt); 2011 gcc_assert (load_place != -1); 2012 if (load_place != j) 2013 this_load_permuted = true; 2014 load_permutation.safe_push (load_place); 2015 } 2016 if (!this_load_permuted 2017 /* The load requires permutation when unrolling exposes 2018 a gap either because the group is larger than the SLP 2019 group-size or because there is a gap between the groups. 
*/
2020                && (unrolling_factor == 1
2021                    || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2022                        && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2023              {
2024                load_permutation.release ();
2025                continue;
2026              }
2027            SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2028            loads_permuted = true;
2029          }
2030
2031        if (loads_permuted)
2032          {
2033            if (!vect_supported_load_permutation_p (new_instance))
2034              {
2035                if (dump_enabled_p ())
2036                  {
2037                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038                                     "Build SLP failed: unsupported load "
2039                                     "permutation ");
2040                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2041                                      TDF_SLIM, stmt, 0);
2042                  }
2043                vect_free_slp_instance (new_instance);
2044                return false;
2045              }
2046          }
2047
2048        /* If the loads and stores can be handled with load/store-lanes
2049           instructions do not generate this SLP instance.  */
2050        if (is_a <loop_vec_info> (vinfo)
2051            && loads_permuted
2052            && dr && vect_store_lanes_supported (vectype, group_size))
2053          {
2054            slp_tree load_node;
2055            FOR_EACH_VEC_ELT (loads, i, load_node)
2056              {
2057                gimple *first_stmt = GROUP_FIRST_ELEMENT
2058                  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2059                stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2060                /* Use SLP for strided accesses (or if we
2061                   can't load-lanes).  */
2062                if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2063                    || ! vect_load_lanes_supported
2064                          (STMT_VINFO_VECTYPE (stmt_vinfo),
2065                           GROUP_SIZE (stmt_vinfo)))
2066                  break;
2067              }
2068            if (i == loads.length ())
2069              {
2070                if (dump_enabled_p ())
2071                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2072                                   "Built SLP cancelled: can use "
2073                                   "load/store-lanes\n");
2074                vect_free_slp_instance (new_instance);
2075                return false;
2076              }
2077          }
2078
2079        vinfo->slp_instances.safe_push (new_instance);
2080
2081        if (dump_enabled_p ())
2082          {
2083            dump_printf_loc (MSG_NOTE, vect_location,
2084                             "Final SLP tree for instance:\n");
2085            vect_print_slp_tree (MSG_NOTE, vect_location, node);
2086          }
2087
2088        return true;
2089      }
2090    }
2091  else
2092    {
2093      /* Failed to SLP.  */
2094      /* Free the allocated memory.  */
2095      scalar_stmts.release ();
2096      loads.release ();
2097    }
2098
2099  /* For basic block SLP, try to break the group up into multiples of the
2100     vector size.  */
2101  if (is_a <bb_vec_info> (vinfo)
2102      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2103      && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2104    {
2105      /* We consider breaking the group only on VF boundaries from the existing
2106         start.  */
2107      for (i = 0; i < group_size; i++)
2108        if (!matches[i]) break;
2109
2110      if (i >= nunits && i < group_size)
2111        {
2112          /* Split into two groups at the first vector boundary before i.  */
2113          gcc_assert ((nunits & (nunits - 1)) == 0);
2114          unsigned group1_size = i & ~(nunits - 1);
2115
2116          gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2117          bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2118          /* If the first non-match was in the middle of a vector,
2119             skip the rest of that vector.  */
2120          if (group1_size < i)
2121            {
2122              i = group1_size + nunits;
2123              if (i < group_size)
2124                rest = vect_split_slp_store_group (rest, nunits);
2125            }
2126          if (i < group_size)
2127            res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2128          return res;
2129        }
2130      /* Even though the first vector did not all match, we might be able to SLP
2131         (some) of the remainder.  FORNOW ignore this possibility.
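   For reference, the boundary arithmetic above works as follows
   (hypothetical numbers): with NUNITS == 4 and the first mismatch at
   I == 6, GROUP1_SIZE = 6 & ~3 == 4, so stmts [0, 4) are re-analyzed as
   their own group, the vector [4, 8) containing the mismatch is split
   off and skipped, and analysis resumes with the group starting at
   stmt 8.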
*/
2132    }
2133
2134  return false;
2135 }
2136
2137
2138 /* Check if there are stmts in the loop that can be vectorized using SLP.
2139    Build SLP trees of packed scalar stmts if SLP is possible.  */
2140
2141 bool
2142 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2143 {
2144   unsigned int i;
2145   gimple *first_element;
2146   bool ok = false;
2147
2148   if (dump_enabled_p ())
2149     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2150
2151   /* Find SLP sequences starting from groups of grouped stores.  */
2152   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2153     if (vect_analyze_slp_instance (vinfo, first_element, max_tree_size))
2154       ok = true;
2155
2156   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2157     {
2158       if (loop_vinfo->reduction_chains.length () > 0)
2159         {
2160           /* Find SLP sequences starting from reduction chains.  */
2161           FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2162             if (vect_analyze_slp_instance (vinfo, first_element,
2163                                            max_tree_size))
2164               ok = true;
2165             else
2166               return false;
2167
2168           /* Don't try to vectorize SLP reductions if a reduction chain was
2169              detected.  */
2170           return ok;
2171         }
2172
2173       /* Find SLP sequences starting from groups of reductions.  */
2174       if (loop_vinfo->reductions.length () > 1
2175           && vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2176                                         max_tree_size))
2177         ok = true;
2178     }
2179
2180   return true;
2181 }
2182
2183
2184 /* For each possible SLP instance decide whether to SLP it and calculate the
2185    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
2186    to SLP at least one instance.  */
2187
2188 bool
2189 vect_make_slp_decision (loop_vec_info loop_vinfo)
2190 {
2191   unsigned int i, unrolling_factor = 1;
2192   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2193   slp_instance instance;
2194   int decided_to_slp = 0;
2195
2196   if (dump_enabled_p ())
2197     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2198                      "\n");
2199
2200   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2201     {
2202       /* FORNOW: SLP if you can.  */
2203       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
2204         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
2205
2206       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
2207          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2208          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
2209       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2210       decided_to_slp++;
2211     }
2212
2213   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2214
2215   if (decided_to_slp && dump_enabled_p ())
2216     dump_printf_loc (MSG_NOTE, vect_location,
2217                      "Decided to SLP %d instances.  Unrolling factor %d\n",
2218                      decided_to_slp, unrolling_factor);
2219
2220   return (decided_to_slp > 0);
2221 }
2222
2223
2224 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2225    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.
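   The downward propagation can be pictured as a stripped-down tree walk
   (a sketch over a hypothetical node type, not the GCC data structures):  */

#if 0
struct sketch_node
{
  int nchildren;
  struct sketch_node **children;
  int hybrid;
};

/* Sketch: once a node is hybrid (its def is also used by non-SLP
   stmts), everything feeding it must be computed by the loop-based
   vectorizer as well, so the property propagates to all children.  */
static void
sketch_mark_hybrid (struct sketch_node *node, int parent_hybrid)
{
  if (parent_hybrid)
    node->hybrid = 1;
  for (int i = 0; i < node->nchildren; i++)
    sketch_mark_hybrid (node->children[i], node->hybrid);
}
#endif

/* Worker for one scalar lane I of NODE, propagating STYPE downwards.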
*/ 2226 2227 static void 2228 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype) 2229 { 2230 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i]; 2231 imm_use_iterator imm_iter; 2232 gimple *use_stmt; 2233 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt); 2234 slp_tree child; 2235 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 2236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2237 int j; 2238 2239 /* Propagate hybrid down the SLP tree. */ 2240 if (stype == hybrid) 2241 ; 2242 else if (HYBRID_SLP_STMT (stmt_vinfo)) 2243 stype = hybrid; 2244 else 2245 { 2246 /* Check if a pure SLP stmt has uses in non-SLP stmts. */ 2247 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo)); 2248 /* If we get a pattern stmt here we have to use the LHS of the 2249 original stmt for immediate uses. */ 2250 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo) 2251 && STMT_VINFO_RELATED_STMT (stmt_vinfo)) 2252 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo); 2253 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME) 2254 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0)) 2255 { 2256 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2257 continue; 2258 use_vinfo = vinfo_for_stmt (use_stmt); 2259 if (STMT_VINFO_IN_PATTERN_P (use_vinfo) 2260 && STMT_VINFO_RELATED_STMT (use_vinfo)) 2261 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo)); 2262 if (!STMT_SLP_TYPE (use_vinfo) 2263 && (STMT_VINFO_RELEVANT (use_vinfo) 2264 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))) 2265 && !(gimple_code (use_stmt) == GIMPLE_PHI 2266 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def)) 2267 { 2268 if (dump_enabled_p ()) 2269 { 2270 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP " 2271 "def in non-SLP stmt: "); 2272 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0); 2273 } 2274 stype = hybrid; 2275 } 2276 } 2277 } 2278 2279 if (stype == hybrid 2280 && !HYBRID_SLP_STMT (stmt_vinfo)) 2281 { 2282 if (dump_enabled_p ()) 2283 { 2284 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: "); 2285 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2286 } 2287 STMT_SLP_TYPE (stmt_vinfo) = hybrid; 2288 } 2289 2290 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2291 if (SLP_TREE_DEF_TYPE (child) != vect_external_def) 2292 vect_detect_hybrid_slp_stmts (child, i, stype); 2293 } 2294 2295 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */ 2296 2297 static tree 2298 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data) 2299 { 2300 walk_stmt_info *wi = (walk_stmt_info *)data; 2301 struct loop *loopp = (struct loop *)wi->info; 2302 2303 if (wi->is_lhs) 2304 return NULL_TREE; 2305 2306 if (TREE_CODE (*tp) == SSA_NAME 2307 && !SSA_NAME_IS_DEFAULT_DEF (*tp)) 2308 { 2309 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp); 2310 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt)) 2311 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt))) 2312 { 2313 if (dump_enabled_p ()) 2314 { 2315 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: "); 2316 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0); 2317 } 2318 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid; 2319 } 2320 } 2321 2322 return NULL_TREE; 2323 } 2324 2325 static tree 2326 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled, 2327 walk_stmt_info *) 2328 { 2329 /* If the stmt is in a SLP instance then this isn't a reason 2330 to mark use definitions in other SLP instances as hybrid. 
*/ 2331 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect) 2332 *handled = true; 2333 return NULL_TREE; 2334 } 2335 2336 /* Find stmts that must be both vectorized and SLPed. */ 2337 2338 void 2339 vect_detect_hybrid_slp (loop_vec_info loop_vinfo) 2340 { 2341 unsigned int i; 2342 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2343 slp_instance instance; 2344 2345 if (dump_enabled_p ()) 2346 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===" 2347 "\n"); 2348 2349 /* First walk all pattern stmt in the loop and mark defs of uses as 2350 hybrid because immediate uses in them are not recorded. */ 2351 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2352 { 2353 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2354 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 2355 gsi_next (&gsi)) 2356 { 2357 gimple *stmt = gsi_stmt (gsi); 2358 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2359 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2360 { 2361 walk_stmt_info wi; 2362 memset (&wi, 0, sizeof (wi)); 2363 wi.info = LOOP_VINFO_LOOP (loop_vinfo); 2364 gimple_stmt_iterator gsi2 2365 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); 2366 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2, 2367 vect_detect_hybrid_slp_1, &wi); 2368 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info), 2369 vect_detect_hybrid_slp_2, 2370 vect_detect_hybrid_slp_1, &wi); 2371 } 2372 } 2373 } 2374 2375 /* Then walk the SLP instance trees marking stmts with uses in 2376 non-SLP stmts as hybrid, also propagating hybrid down the 2377 SLP tree, collecting the above info on-the-fly. */ 2378 FOR_EACH_VEC_ELT (slp_instances, i, instance) 2379 { 2380 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i) 2381 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance), 2382 i, pure_slp); 2383 } 2384 } 2385 2386 2387 /* Create and initialize a new bb_vec_info struct for BB, as well as 2388 stmt_vec_info structs for all the stmts in it. */ 2389 2390 static bb_vec_info 2391 new_bb_vec_info (gimple_stmt_iterator region_begin, 2392 gimple_stmt_iterator region_end) 2393 { 2394 basic_block bb = gsi_bb (region_begin); 2395 bb_vec_info res = NULL; 2396 gimple_stmt_iterator gsi; 2397 2398 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info)); 2399 res->kind = vec_info::bb; 2400 BB_VINFO_BB (res) = bb; 2401 res->region_begin = region_begin; 2402 res->region_end = region_end; 2403 2404 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end); 2405 gsi_next (&gsi)) 2406 { 2407 gimple *stmt = gsi_stmt (gsi); 2408 gimple_set_uid (stmt, 0); 2409 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res)); 2410 } 2411 2412 BB_VINFO_GROUPED_STORES (res).create (10); 2413 BB_VINFO_SLP_INSTANCES (res).create (2); 2414 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL); 2415 2416 bb->aux = res; 2417 return res; 2418 } 2419 2420 2421 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the 2422 stmts in the basic block. 
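   Both this function and new_bb_vec_info above iterate over the half-open
   region [region_begin, region_end); a minimal sketch of that idiom
   (plain C over a hypothetical list type, not the GSI API):  */

#if 0
struct sketch_stmt
{
  struct sketch_stmt *next;
};

/* Sketch: visit every stmt from BEGIN up to, but not including, END.
   END acts as a sentinel and is never treated as part of the region.  */
static void
sketch_walk_region (struct sketch_stmt *begin, struct sketch_stmt *end)
{
  for (struct sketch_stmt *s = begin; s != end; s = s->next)
    /* Process S here.  */;
}
#endif

/* Tear down the bb_vec_info for such a region.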
*/ 2423 2424 static void 2425 destroy_bb_vec_info (bb_vec_info bb_vinfo) 2426 { 2427 slp_instance instance; 2428 unsigned i; 2429 2430 if (!bb_vinfo) 2431 return; 2432 2433 vect_destroy_datarefs (bb_vinfo); 2434 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo)); 2435 BB_VINFO_GROUPED_STORES (bb_vinfo).release (); 2436 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance) 2437 vect_free_slp_instance (instance); 2438 BB_VINFO_SLP_INSTANCES (bb_vinfo).release (); 2439 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo)); 2440 2441 for (gimple_stmt_iterator si = bb_vinfo->region_begin; 2442 gsi_stmt (si) != gsi_stmt (bb_vinfo->region_end); gsi_next (&si)) 2443 { 2444 gimple *stmt = gsi_stmt (si); 2445 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2446 2447 if (stmt_info) 2448 /* Free stmt_vec_info. */ 2449 free_stmt_vec_info (stmt); 2450 2451 /* Reset region marker. */ 2452 gimple_set_uid (stmt, -1); 2453 } 2454 2455 BB_VINFO_BB (bb_vinfo)->aux = NULL; 2456 free (bb_vinfo); 2457 } 2458 2459 2460 /* Analyze statements contained in SLP tree node after recursively analyzing 2461 the subtree. Return TRUE if the operations are supported. */ 2462 2463 static bool 2464 vect_slp_analyze_node_operations (slp_tree node) 2465 { 2466 bool dummy; 2467 int i, j; 2468 gimple *stmt; 2469 slp_tree child; 2470 2471 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 2472 return true; 2473 2474 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 2475 if (!vect_slp_analyze_node_operations (child)) 2476 return false; 2477 2478 bool res = true; 2479 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) 2480 { 2481 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2482 gcc_assert (stmt_info); 2483 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect); 2484 2485 /* Push SLP node def-type to stmt operands. */ 2486 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2487 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 2488 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[i])) 2489 = SLP_TREE_DEF_TYPE (child); 2490 res = vect_analyze_stmt (stmt, &dummy, node); 2491 /* Restore def-types. */ 2492 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2493 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 2494 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[i])) 2495 = vect_internal_def; 2496 if (! res) 2497 break; 2498 } 2499 2500 return res; 2501 } 2502 2503 2504 /* Analyze statements in SLP instances of the basic block. Return TRUE if the 2505 operations are supported. */ 2506 2507 bool 2508 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data) 2509 { 2510 slp_instance instance; 2511 int i; 2512 2513 if (dump_enabled_p ()) 2514 dump_printf_loc (MSG_NOTE, vect_location, 2515 "=== vect_slp_analyze_operations ===\n"); 2516 2517 for (i = 0; slp_instances.iterate (i, &instance); ) 2518 { 2519 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance))) 2520 { 2521 dump_printf_loc (MSG_NOTE, vect_location, 2522 "removing SLP instance operations starting from: "); 2523 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 2524 SLP_TREE_SCALAR_STMTS 2525 (SLP_INSTANCE_TREE (instance))[0], 0); 2526 vect_free_slp_instance (instance); 2527 slp_instances.ordered_remove (i); 2528 } 2529 else 2530 { 2531 /* Compute the costs of the SLP instance. 
*/ 2532 vect_analyze_slp_cost (instance, data); 2533 i++; 2534 } 2535 } 2536 2537 if (!slp_instances.length ()) 2538 return false; 2539 2540 return true; 2541 } 2542 2543 2544 /* Compute the scalar cost of the SLP node NODE and its children 2545 and return it. Do not account defs that are marked in LIFE and 2546 update LIFE according to uses of NODE. */ 2547 2548 static unsigned 2549 vect_bb_slp_scalar_cost (basic_block bb, 2550 slp_tree node, vec<bool, va_heap> *life) 2551 { 2552 unsigned scalar_cost = 0; 2553 unsigned i; 2554 gimple *stmt; 2555 slp_tree child; 2556 2557 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) 2558 { 2559 unsigned stmt_cost; 2560 ssa_op_iter op_iter; 2561 def_operand_p def_p; 2562 stmt_vec_info stmt_info; 2563 2564 if ((*life)[i]) 2565 continue; 2566 2567 /* If there is a non-vectorized use of the defs then the scalar 2568 stmt is kept live in which case we do not account it or any 2569 required defs in the SLP children in the scalar cost. This 2570 way we make the vectorization more costly when compared to 2571 the scalar cost. */ 2572 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF) 2573 { 2574 imm_use_iterator use_iter; 2575 gimple *use_stmt; 2576 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) 2577 if (!is_gimple_debug (use_stmt) 2578 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo, 2579 use_stmt) 2580 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt)))) 2581 { 2582 (*life)[i] = true; 2583 BREAK_FROM_IMM_USE_STMT (use_iter); 2584 } 2585 } 2586 if ((*life)[i]) 2587 continue; 2588 2589 /* Count scalar stmts only once. */ 2590 if (gimple_visited_p (stmt)) 2591 continue; 2592 gimple_set_visited (stmt, true); 2593 2594 stmt_info = vinfo_for_stmt (stmt); 2595 if (STMT_VINFO_DATA_REF (stmt_info)) 2596 { 2597 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 2598 stmt_cost = vect_get_stmt_cost (scalar_load); 2599 else 2600 stmt_cost = vect_get_stmt_cost (scalar_store); 2601 } 2602 else 2603 stmt_cost = vect_get_stmt_cost (scalar_stmt); 2604 2605 scalar_cost += stmt_cost; 2606 } 2607 2608 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 2609 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) 2610 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life); 2611 2612 return scalar_cost; 2613 } 2614 2615 /* Check if vectorization of the basic block is profitable. */ 2616 2617 static bool 2618 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) 2619 { 2620 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); 2621 slp_instance instance; 2622 int i; 2623 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; 2624 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; 2625 2626 /* Calculate scalar cost. */ 2627 FOR_EACH_VEC_ELT (slp_instances, i, instance) 2628 { 2629 auto_vec<bool, 20> life; 2630 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance)); 2631 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo), 2632 SLP_INSTANCE_TREE (instance), 2633 &life); 2634 } 2635 2636 /* Unset visited flag. */ 2637 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin; 2638 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi)) 2639 gimple_set_visited (gsi_stmt (gsi), false); 2640 2641 /* Complete the target-specific cost calculation. 
*/
2642   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2643                &vec_inside_cost, &vec_epilogue_cost);
2644
2645   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2646
2647   if (dump_enabled_p ())
2648     {
2649       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2650       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2651                    vec_inside_cost);
2652       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2653       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2654       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2655     }
2656
2657   /* Vectorization is profitable if its cost is no more than the cost of the
2658      scalar version.  Note that we err on the vector side for equal cost
2659      because the cost estimate is otherwise quite pessimistic (constant uses
2660      are free on the scalar side but cost a load on the vector side for
2661      example).  */
2662   if (vec_outside_cost + vec_inside_cost > scalar_cost)
2663     return false;
2664
2665   return true;
2666 }
2667
2668 /* Check if the basic block can be vectorized.  Returns a bb_vec_info
2669    if so and sets FATAL to true if failure is independent of
2670    current_vector_size.  */
2671
2672 static bb_vec_info
2673 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2674                        gimple_stmt_iterator region_end,
2675                        vec<data_reference_p> datarefs, int n_stmts,
2676                        bool &fatal)
2677 {
2678   bb_vec_info bb_vinfo;
2679   slp_instance instance;
2680   int i;
2681   int min_vf = 2;
2682
2683   /* The first group of checks is independent of the vector size.  */
2684   fatal = true;
2685
2686   if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2687     {
2688       if (dump_enabled_p ())
2689         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2690                          "not vectorized: too many instructions in "
2691                          "basic block.\n");
2692       free_data_refs (datarefs);
2693       return NULL;
2694     }
2695
2696   bb_vinfo = new_bb_vec_info (region_begin, region_end);
2697   if (!bb_vinfo)
2698     return NULL;
2699
2700   BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2701
2702   /* Analyze the data references.  */
2703
2704   if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2705     {
2706       if (dump_enabled_p ())
2707         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2708                          "not vectorized: unhandled data-ref in basic "
2709                          "block.\n");
2710
2711       destroy_bb_vec_info (bb_vinfo);
2712       return NULL;
2713     }
2714
2715   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2716     {
2717       if (dump_enabled_p ())
2718         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2719                          "not vectorized: not enough data-refs in "
2720                          "basic block.\n");
2721
2722       destroy_bb_vec_info (bb_vinfo);
2723       return NULL;
2724     }
2725
2726   if (!vect_analyze_data_ref_accesses (bb_vinfo))
2727     {
2728       if (dump_enabled_p ())
2729         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2730                          "not vectorized: unhandled data access in "
2731                          "basic block.\n");
2732
2733       destroy_bb_vec_info (bb_vinfo);
2734       return NULL;
2735     }
2736
2737   /* If there are no grouped stores in the region there is no need
2738      to continue with pattern recog as vect_analyze_slp will fail
2739      anyway.  */
2740   if (bb_vinfo->grouped_stores.is_empty ())
2741     {
2742       if (dump_enabled_p ())
2743         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744                          "not vectorized: no grouped stores in "
2745                          "basic block.\n");
2746
2747       destroy_bb_vec_info (bb_vinfo);
2748       return NULL;
2749     }
2750
2751   /* The rest of the analysis below may depend on the vector size, so from
      here on a failure is no longer fatal.
*/ 2752 fatal = false; 2753 2754 vect_pattern_recog (bb_vinfo); 2755 2756 /* Check the SLP opportunities in the basic block, analyze and build SLP 2757 trees. */ 2758 if (!vect_analyze_slp (bb_vinfo, n_stmts)) 2759 { 2760 if (dump_enabled_p ()) 2761 { 2762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2763 "Failed to SLP the basic block.\n"); 2764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2765 "not vectorized: failed to find SLP opportunities " 2766 "in basic block.\n"); 2767 } 2768 2769 destroy_bb_vec_info (bb_vinfo); 2770 return NULL; 2771 } 2772 2773 /* Analyze and verify the alignment of data references and the 2774 dependence in the SLP instances. */ 2775 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); ) 2776 { 2777 if (! vect_slp_analyze_and_verify_instance_alignment (instance) 2778 || ! vect_slp_analyze_instance_dependence (instance)) 2779 { 2780 dump_printf_loc (MSG_NOTE, vect_location, 2781 "removing SLP instance operations starting from: "); 2782 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 2783 SLP_TREE_SCALAR_STMTS 2784 (SLP_INSTANCE_TREE (instance))[0], 0); 2785 vect_free_slp_instance (instance); 2786 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i); 2787 continue; 2788 } 2789 2790 /* Mark all the statements that we want to vectorize as pure SLP and 2791 relevant. */ 2792 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1); 2793 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); 2794 2795 i++; 2796 } 2797 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) 2798 { 2799 destroy_bb_vec_info (bb_vinfo); 2800 return NULL; 2801 } 2802 2803 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo), 2804 BB_VINFO_TARGET_COST_DATA (bb_vinfo))) 2805 { 2806 if (dump_enabled_p ()) 2807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2808 "not vectorized: bad operation in basic block.\n"); 2809 2810 destroy_bb_vec_info (bb_vinfo); 2811 return NULL; 2812 } 2813 2814 /* Cost model: check if the vectorization is worthwhile. */ 2815 if (!unlimited_cost_model (NULL) 2816 && !vect_bb_vectorization_profitable_p (bb_vinfo)) 2817 { 2818 if (dump_enabled_p ()) 2819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2820 "not vectorized: vectorization is not " 2821 "profitable.\n"); 2822 2823 destroy_bb_vec_info (bb_vinfo); 2824 return NULL; 2825 } 2826 2827 if (dump_enabled_p ()) 2828 dump_printf_loc (MSG_NOTE, vect_location, 2829 "Basic block will be vectorized using SLP\n"); 2830 2831 return bb_vinfo; 2832 } 2833 2834 2835 /* Main entry for the BB vectorizer. Analyze and transform BB, returns 2836 true if anything in the basic-block was vectorized. */ 2837 2838 bool 2839 vect_slp_bb (basic_block bb) 2840 { 2841 bb_vec_info bb_vinfo; 2842 gimple_stmt_iterator gsi; 2843 unsigned int vector_sizes; 2844 bool any_vectorized = false; 2845 2846 if (dump_enabled_p ()) 2847 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n"); 2848 2849 /* Autodetect first vector size we try. 
*/ 2850 current_vector_size = 0; 2851 vector_sizes = targetm.vectorize.autovectorize_vector_sizes (); 2852 2853 gsi = gsi_start_bb (bb); 2854 2855 while (1) 2856 { 2857 if (gsi_end_p (gsi)) 2858 break; 2859 2860 gimple_stmt_iterator region_begin = gsi; 2861 vec<data_reference_p> datarefs = vNULL; 2862 int insns = 0; 2863 2864 for (; !gsi_end_p (gsi); gsi_next (&gsi)) 2865 { 2866 gimple *stmt = gsi_stmt (gsi); 2867 if (is_gimple_debug (stmt)) 2868 continue; 2869 insns++; 2870 2871 if (gimple_location (stmt) != UNKNOWN_LOCATION) 2872 vect_location = gimple_location (stmt); 2873 2874 if (!find_data_references_in_stmt (NULL, stmt, &datarefs)) 2875 break; 2876 } 2877 2878 /* Skip leading unhandled stmts. */ 2879 if (gsi_stmt (region_begin) == gsi_stmt (gsi)) 2880 { 2881 gsi_next (&gsi); 2882 continue; 2883 } 2884 2885 gimple_stmt_iterator region_end = gsi; 2886 2887 bool vectorized = false; 2888 bool fatal = false; 2889 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end, 2890 datarefs, insns, fatal); 2891 if (bb_vinfo 2892 && dbg_cnt (vect_slp)) 2893 { 2894 if (dump_enabled_p ()) 2895 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); 2896 2897 vect_schedule_slp (bb_vinfo); 2898 2899 if (dump_enabled_p ()) 2900 dump_printf_loc (MSG_NOTE, vect_location, 2901 "basic block part vectorized\n"); 2902 2903 destroy_bb_vec_info (bb_vinfo); 2904 2905 vectorized = true; 2906 } 2907 else 2908 destroy_bb_vec_info (bb_vinfo); 2909 2910 any_vectorized |= vectorized; 2911 2912 vector_sizes &= ~current_vector_size; 2913 if (vectorized 2914 || vector_sizes == 0 2915 || current_vector_size == 0 2916 /* If vect_slp_analyze_bb_1 signaled that analysis for all 2917 vector sizes will fail do not bother iterating. */ 2918 || fatal) 2919 { 2920 if (gsi_end_p (region_end)) 2921 break; 2922 2923 /* Skip the unhandled stmt. */ 2924 gsi_next (&gsi); 2925 2926 /* And reset vector sizes. */ 2927 current_vector_size = 0; 2928 vector_sizes = targetm.vectorize.autovectorize_vector_sizes (); 2929 } 2930 else 2931 { 2932 /* Try the next biggest vector size. */ 2933 current_vector_size = 1 << floor_log2 (vector_sizes); 2934 if (dump_enabled_p ()) 2935 dump_printf_loc (MSG_NOTE, vect_location, 2936 "***** Re-trying analysis with " 2937 "vector size %d\n", current_vector_size); 2938 2939 /* Start over. */ 2940 gsi = region_begin; 2941 } 2942 } 2943 2944 return any_vectorized; 2945 } 2946 2947 2948 /* Return 1 if vector type of boolean constant which is OPNUM 2949 operand in statement STMT is a boolean vector. */ 2950 2951 static bool 2952 vect_mask_constant_operand_p (gimple *stmt, int opnum) 2953 { 2954 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 2955 enum tree_code code = gimple_expr_code (stmt); 2956 tree op, vectype; 2957 gimple *def_stmt; 2958 enum vect_def_type dt; 2959 2960 /* For comparison and COND_EXPR type is chosen depending 2961 on the other comparison operand. 
*/ 2962 if (TREE_CODE_CLASS (code) == tcc_comparison) 2963 { 2964 if (opnum) 2965 op = gimple_assign_rhs1 (stmt); 2966 else 2967 op = gimple_assign_rhs2 (stmt); 2968 2969 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt, 2970 &dt, &vectype)) 2971 gcc_unreachable (); 2972 2973 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype); 2974 } 2975 2976 if (code == COND_EXPR) 2977 { 2978 tree cond = gimple_assign_rhs1 (stmt); 2979 2980 if (TREE_CODE (cond) == SSA_NAME) 2981 op = cond; 2982 else if (opnum) 2983 op = TREE_OPERAND (cond, 1); 2984 else 2985 op = TREE_OPERAND (cond, 0); 2986 2987 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt, 2988 &dt, &vectype)) 2989 gcc_unreachable (); 2990 2991 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype); 2992 } 2993 2994 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo)); 2995 } 2996 2997 2998 /* For constant and loop invariant defs of SLP_NODE this function returns 2999 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. 3000 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of 3001 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. 3002 REDUC_INDEX is the index of the reduction operand in the statements, unless 3003 it is -1. */ 3004 3005 static void 3006 vect_get_constant_vectors (tree op, slp_tree slp_node, 3007 vec<tree> *vec_oprnds, 3008 unsigned int op_num, unsigned int number_of_vectors, 3009 int reduc_index) 3010 { 3011 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 3012 gimple *stmt = stmts[0]; 3013 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 3014 unsigned nunits; 3015 tree vec_cst; 3016 tree *elts; 3017 unsigned j, number_of_places_left_in_vector; 3018 tree vector_type; 3019 tree vop; 3020 int group_size = stmts.length (); 3021 unsigned int vec_num, i; 3022 unsigned number_of_copies = 1; 3023 vec<tree> voprnds; 3024 voprnds.create (number_of_vectors); 3025 bool constant_p, is_store; 3026 tree neutral_op = NULL; 3027 enum tree_code code = gimple_expr_code (stmt); 3028 gimple *def_stmt; 3029 struct loop *loop; 3030 gimple_seq ctor_seq = NULL; 3031 3032 /* Check if vector type is a boolean vector. */ 3033 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op)) 3034 && vect_mask_constant_operand_p (stmt, op_num)) 3035 vector_type 3036 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo)); 3037 else 3038 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); 3039 nunits = TYPE_VECTOR_SUBPARTS (vector_type); 3040 3041 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def 3042 && reduc_index != -1) 3043 { 3044 op_num = reduc_index; 3045 op = gimple_op (stmt, op_num + 1); 3046 /* For additional copies (see the explanation of NUMBER_OF_COPIES below) 3047 we need either neutral operands or the original operands. See 3048 get_initial_def_for_reduction() for details. 
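   (Concretely, the switch below uses 0 for the additive cases
   WIDEN_SUM/DOT_PROD/SAD/PLUS/MINUS/IOR/XOR, 1 for MULT, all-ones for
   BIT_AND, and for MIN/MAX falls back to the initial value from the
   loop preheader since no neutral element exists.)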
*/ 3049 switch (code) 3050 { 3051 case WIDEN_SUM_EXPR: 3052 case DOT_PROD_EXPR: 3053 case SAD_EXPR: 3054 case PLUS_EXPR: 3055 case MINUS_EXPR: 3056 case BIT_IOR_EXPR: 3057 case BIT_XOR_EXPR: 3058 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op))) 3059 neutral_op = build_real (TREE_TYPE (op), dconst0); 3060 else 3061 neutral_op = build_int_cst (TREE_TYPE (op), 0); 3062 3063 break; 3064 3065 case MULT_EXPR: 3066 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op))) 3067 neutral_op = build_real (TREE_TYPE (op), dconst1); 3068 else 3069 neutral_op = build_int_cst (TREE_TYPE (op), 1); 3070 3071 break; 3072 3073 case BIT_AND_EXPR: 3074 neutral_op = build_int_cst (TREE_TYPE (op), -1); 3075 break; 3076 3077 /* For MIN/MAX we don't have an easy neutral operand but 3078 the initial values can be used fine here. Only for 3079 a reduction chain we have to force a neutral element. */ 3080 case MAX_EXPR: 3081 case MIN_EXPR: 3082 if (!GROUP_FIRST_ELEMENT (stmt_vinfo)) 3083 neutral_op = NULL; 3084 else 3085 { 3086 def_stmt = SSA_NAME_DEF_STMT (op); 3087 loop = (gimple_bb (stmt))->loop_father; 3088 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt, 3089 loop_preheader_edge (loop)); 3090 } 3091 break; 3092 3093 default: 3094 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo)); 3095 neutral_op = NULL; 3096 } 3097 } 3098 3099 if (STMT_VINFO_DATA_REF (stmt_vinfo)) 3100 { 3101 is_store = true; 3102 op = gimple_assign_rhs1 (stmt); 3103 } 3104 else 3105 is_store = false; 3106 3107 gcc_assert (op); 3108 3109 if (CONSTANT_CLASS_P (op)) 3110 constant_p = true; 3111 else 3112 constant_p = false; 3113 3114 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 3115 created vectors. It is greater than 1 if unrolling is performed. 3116 3117 For example, we have two scalar operands, s1 and s2 (e.g., group of 3118 strided accesses of size two), while NUNITS is four (i.e., four scalars 3119 of this type can be packed in a vector). The output vector will contain 3120 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 3121 will be 2). 3122 3123 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors 3124 containing the operands. 3125 3126 For example, NUNITS is four as before, and the group size is 8 3127 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 3128 {s5, s6, s7, s8}. 
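   In terms of the formula below, the first example has
   NUMBER_OF_COPIES = 4 * 1 / 2 == 2 and the second
   NUMBER_OF_COPIES = 4 * 2 / 8 == 1.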
*/ 3129 3130 number_of_copies = nunits * number_of_vectors / group_size; 3131 3132 number_of_places_left_in_vector = nunits; 3133 elts = XALLOCAVEC (tree, nunits); 3134 bool place_after_defs = false; 3135 for (j = 0; j < number_of_copies; j++) 3136 { 3137 for (i = group_size - 1; stmts.iterate (i, &stmt); i--) 3138 { 3139 if (is_store) 3140 op = gimple_assign_rhs1 (stmt); 3141 else 3142 { 3143 switch (code) 3144 { 3145 case COND_EXPR: 3146 { 3147 tree cond = gimple_assign_rhs1 (stmt); 3148 if (TREE_CODE (cond) == SSA_NAME) 3149 op = gimple_op (stmt, op_num + 1); 3150 else if (op_num == 0 || op_num == 1) 3151 op = TREE_OPERAND (cond, op_num); 3152 else 3153 { 3154 if (op_num == 2) 3155 op = gimple_assign_rhs2 (stmt); 3156 else 3157 op = gimple_assign_rhs3 (stmt); 3158 } 3159 } 3160 break; 3161 3162 case CALL_EXPR: 3163 op = gimple_call_arg (stmt, op_num); 3164 break; 3165 3166 case LSHIFT_EXPR: 3167 case RSHIFT_EXPR: 3168 case LROTATE_EXPR: 3169 case RROTATE_EXPR: 3170 op = gimple_op (stmt, op_num + 1); 3171 /* Unlike the other binary operators, shifts/rotates have 3172 the shift count being int, instead of the same type as 3173 the lhs, so make sure the scalar is the right type if 3174 we are dealing with vectors of 3175 long long/long/short/char. */ 3176 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST) 3177 op = fold_convert (TREE_TYPE (vector_type), op); 3178 break; 3179 3180 default: 3181 op = gimple_op (stmt, op_num + 1); 3182 break; 3183 } 3184 } 3185 3186 if (reduc_index != -1) 3187 { 3188 loop = (gimple_bb (stmt))->loop_father; 3189 def_stmt = SSA_NAME_DEF_STMT (op); 3190 3191 gcc_assert (loop); 3192 3193 /* Get the def before the loop. In reduction chain we have only 3194 one initial value. */ 3195 if ((j != (number_of_copies - 1) 3196 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) 3197 && i != 0)) 3198 && neutral_op) 3199 op = neutral_op; 3200 else 3201 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, 3202 loop_preheader_edge (loop)); 3203 } 3204 3205 /* Create 'vect_ = {op0,op1,...,opn}'. */ 3206 number_of_places_left_in_vector--; 3207 tree orig_op = op; 3208 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op))) 3209 { 3210 if (CONSTANT_CLASS_P (op)) 3211 { 3212 if (VECTOR_BOOLEAN_TYPE_P (vector_type)) 3213 { 3214 /* Can't use VIEW_CONVERT_EXPR for booleans because 3215 of possibly different sizes of scalar value and 3216 vector element. 
*/ 3217 if (integer_zerop (op)) 3218 op = build_int_cst (TREE_TYPE (vector_type), 0); 3219 else if (integer_onep (op)) 3220 op = build_all_ones_cst (TREE_TYPE (vector_type)); 3221 else 3222 gcc_unreachable (); 3223 } 3224 else 3225 op = fold_unary (VIEW_CONVERT_EXPR, 3226 TREE_TYPE (vector_type), op); 3227 gcc_assert (op && CONSTANT_CLASS_P (op)); 3228 } 3229 else 3230 { 3231 tree new_temp = make_ssa_name (TREE_TYPE (vector_type)); 3232 gimple *init_stmt; 3233 if (VECTOR_BOOLEAN_TYPE_P (vector_type)) 3234 { 3235 tree true_val 3236 = build_all_ones_cst (TREE_TYPE (vector_type)); 3237 tree false_val 3238 = build_zero_cst (TREE_TYPE (vector_type)); 3239 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op))); 3240 init_stmt = gimple_build_assign (new_temp, COND_EXPR, 3241 op, true_val, 3242 false_val); 3243 } 3244 else 3245 { 3246 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), 3247 op); 3248 init_stmt 3249 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, 3250 op); 3251 } 3252 gimple_seq_add_stmt (&ctor_seq, init_stmt); 3253 op = new_temp; 3254 } 3255 } 3256 elts[number_of_places_left_in_vector] = op; 3257 if (!CONSTANT_CLASS_P (op)) 3258 constant_p = false; 3259 if (TREE_CODE (orig_op) == SSA_NAME 3260 && !SSA_NAME_IS_DEFAULT_DEF (orig_op) 3261 && STMT_VINFO_BB_VINFO (stmt_vinfo) 3262 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb 3263 == gimple_bb (SSA_NAME_DEF_STMT (orig_op)))) 3264 place_after_defs = true; 3265 3266 if (number_of_places_left_in_vector == 0) 3267 { 3268 number_of_places_left_in_vector = nunits; 3269 3270 if (constant_p) 3271 vec_cst = build_vector (vector_type, elts); 3272 else 3273 { 3274 vec<constructor_elt, va_gc> *v; 3275 unsigned k; 3276 vec_alloc (v, nunits); 3277 for (k = 0; k < nunits; ++k) 3278 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]); 3279 vec_cst = build_constructor (vector_type, v); 3280 } 3281 tree init; 3282 gimple_stmt_iterator gsi; 3283 if (place_after_defs) 3284 { 3285 gsi = gsi_for_stmt 3286 (vect_find_last_scalar_stmt_in_slp (slp_node)); 3287 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi); 3288 } 3289 else 3290 init = vect_init_vector (stmt, vec_cst, vector_type, NULL); 3291 if (ctor_seq != NULL) 3292 { 3293 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init)); 3294 gsi_insert_seq_before_without_update (&gsi, ctor_seq, 3295 GSI_SAME_STMT); 3296 ctor_seq = NULL; 3297 } 3298 voprnds.quick_push (init); 3299 place_after_defs = false; 3300 } 3301 } 3302 } 3303 3304 /* Since the vectors are created in the reverse order, we should invert 3305 them. */ 3306 vec_num = voprnds.length (); 3307 for (j = vec_num; j != 0; j--) 3308 { 3309 vop = voprnds[j - 1]; 3310 vec_oprnds->quick_push (vop); 3311 } 3312 3313 voprnds.release (); 3314 3315 /* In case that VF is greater than the unrolling factor needed for the SLP 3316 group of stmts, NUMBER_OF_VECTORS to be created is greater than 3317 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have 3318 to replicate the vectors. */ 3319 while (number_of_vectors > vec_oprnds->length ()) 3320 { 3321 tree neutral_vec = NULL; 3322 3323 if (neutral_op) 3324 { 3325 if (!neutral_vec) 3326 neutral_vec = build_vector_from_val (vector_type, neutral_op); 3327 3328 vec_oprnds->quick_push (neutral_vec); 3329 } 3330 else 3331 { 3332 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++) 3333 vec_oprnds->quick_push (vop); 3334 } 3335 } 3336 } 3337 3338 3339 /* Get vectorized definitions from SLP_NODE that contains corresponding 3340 vectorized def-stmts. 
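   (For constant and invariant operands, vect_get_constant_vectors above
   instead fills each vector from the highest element slot downwards and
   finally reverses the list of built vectors; a compact sketch of that
   filling order, with hypothetical names:)  */

#if 0
/* Sketch: distribute GROUP_SIZE scalar operands over NUNITS-element
   vectors, NUMBER_OF_COPIES times, writing slots from the top down the
   way vect_get_constant_vectors does.  */
static void
sketch_fill_order (unsigned nunits, unsigned group_size,
                   unsigned number_of_copies)
{
  unsigned slot = nunits;
  for (unsigned j = 0; j < number_of_copies; j++)
    for (unsigned i = group_size; i-- > 0; )
      {
        --slot;                 /* elts[slot] gets scalar operand I.  */
        if (slot == 0)
          slot = nunits;        /* Vector full: emit it and restart.  */
      }
}
#endif

/* Collect the LHS of each vectorized def-stmt recorded in SLP_NODE.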
*/ 3341 3342 static void 3343 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds) 3344 { 3345 tree vec_oprnd; 3346 gimple *vec_def_stmt; 3347 unsigned int i; 3348 3349 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ()); 3350 3351 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt) 3352 { 3353 gcc_assert (vec_def_stmt); 3354 vec_oprnd = gimple_get_lhs (vec_def_stmt); 3355 vec_oprnds->quick_push (vec_oprnd); 3356 } 3357 } 3358 3359 3360 /* Get vectorized definitions for SLP_NODE. 3361 If the scalar definitions are loop invariants or constants, collect them and 3362 call vect_get_constant_vectors() to create vector stmts. 3363 Otherwise, the def-stmts must be already vectorized and the vectorized stmts 3364 must be stored in the corresponding child of SLP_NODE, and we call 3365 vect_get_slp_vect_defs () to retrieve them. */ 3366 3367 void 3368 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node, 3369 vec<vec<tree> > *vec_oprnds, int reduc_index) 3370 { 3371 gimple *first_stmt; 3372 int number_of_vects = 0, i; 3373 unsigned int child_index = 0; 3374 HOST_WIDE_INT lhs_size_unit, rhs_size_unit; 3375 slp_tree child = NULL; 3376 vec<tree> vec_defs; 3377 tree oprnd; 3378 bool vectorized_defs; 3379 bool first_iteration = true; 3380 3381 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0]; 3382 FOR_EACH_VEC_ELT (ops, i, oprnd) 3383 { 3384 if (oprnd == NULL) 3385 { 3386 /* Only vectorizable_reduction will call us with a NULL op which 3387 will always match the reduction operand for which we have no 3388 SLP child. */ 3389 vec_defs = vNULL; 3390 vec_defs.create (0); 3391 vec_oprnds->quick_push (vec_defs); 3392 continue; 3393 } 3394 3395 /* For each operand we check if it has vectorized definitions in a child 3396 node or we need to create them (for invariants and constants). We 3397 check if the LHS of the first stmt of the next child matches OPRND. 3398 If it does, we found the correct child. Otherwise, we call 3399 vect_get_constant_vectors (), and not advance CHILD_INDEX in order 3400 to check this child node for the next operand. */ 3401 vectorized_defs = false; 3402 if (SLP_TREE_CHILDREN (slp_node).length () > child_index) 3403 { 3404 child = SLP_TREE_CHILDREN (slp_node)[child_index]; 3405 3406 /* We have to check both pattern and original def, if available. */ 3407 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) 3408 { 3409 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0]; 3410 gimple *related 3411 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def)); 3412 3413 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0) 3414 || (related 3415 && operand_equal_p (oprnd, gimple_get_lhs (related), 0))) 3416 { 3417 /* The number of vector defs is determined by the number of 3418 vector statements in the node from which we get those 3419 statements. */ 3420 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child); 3421 vectorized_defs = true; 3422 child_index++; 3423 } 3424 } 3425 else 3426 child_index++; 3427 } 3428 3429 if (!vectorized_defs) 3430 { 3431 if (first_iteration) 3432 { 3433 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 3434 /* Number of vector stmts was calculated according to LHS in 3435 vect_schedule_slp_instance (), fix it by replacing LHS with 3436 RHS, if necessary. See vect_get_smallest_scalar_type () for 3437 details. 
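   (E.g. for a group of 'int' results computed from 'short' operands,
   RHS_SIZE_UNIT == 2 and LHS_SIZE_UNIT == 4, so NUMBER_OF_VECTS is
   halved: twice as many 'short' elements fit into one vector.)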
*/ 3438 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, 3439 &rhs_size_unit); 3440 if (rhs_size_unit != lhs_size_unit) 3441 { 3442 number_of_vects *= rhs_size_unit; 3443 number_of_vects /= lhs_size_unit; 3444 } 3445 } 3446 } 3447 3448 /* Allocate memory for vectorized defs. */ 3449 vec_defs = vNULL; 3450 vec_defs.create (number_of_vects); 3451 3452 /* For reduction defs we call vect_get_constant_vectors (), since we are 3453 looking for initial loop invariant values. */ 3454 if (vectorized_defs && reduc_index == -1) 3455 /* The defs are already vectorized. */ 3456 vect_get_slp_vect_defs (child, &vec_defs); 3457 else 3458 /* Build vectors from scalar defs. */ 3459 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i, 3460 number_of_vects, reduc_index); 3461 3462 vec_oprnds->quick_push (vec_defs); 3463 3464 /* For reductions, we only need initial values. */ 3465 if (reduc_index != -1) 3466 return; 3467 3468 first_iteration = false; 3469 } 3470 } 3471 3472 /* Generate vector permute statements from a list of loads in DR_CHAIN. 3473 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid 3474 permute statements for the SLP node NODE of the SLP instance 3475 SLP_NODE_INSTANCE. */ 3476 3477 bool 3478 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, 3479 gimple_stmt_iterator *gsi, int vf, 3480 slp_instance slp_node_instance, bool analyze_only, 3481 unsigned *n_perms) 3482 { 3483 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 3484 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3485 tree mask_element_type = NULL_TREE, mask_type; 3486 int nunits, vec_index = 0; 3487 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3488 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); 3489 int mask_element; 3490 unsigned char *mask; 3491 machine_mode mode; 3492 3493 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 3494 return false; 3495 3496 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)); 3497 3498 mode = TYPE_MODE (vectype); 3499 3500 /* The generic VEC_PERM_EXPR code always uses an integral type of the 3501 same size as the vector element being permuted. */ 3502 mask_element_type = lang_hooks.types.type_for_mode 3503 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1); 3504 mask_type = get_vectype_for_scalar_type (mask_element_type); 3505 nunits = TYPE_VECTOR_SUBPARTS (vectype); 3506 mask = XALLOCAVEC (unsigned char, nunits); 3507 3508 /* Initialize the vect stmts of NODE to properly insert the generated 3509 stmts later. */ 3510 if (! analyze_only) 3511 for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); 3512 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) 3513 SLP_TREE_VEC_STMTS (node).quick_push (NULL); 3514 3515 /* Generate permutation masks for every NODE. Number of masks for each NODE 3516 is equal to GROUP_SIZE. 3517 E.g., we have a group of three nodes with three loads from the same 3518 location in each node, and the vector size is 4. I.e., we have a 3519 a0b0c0a1b1c1... sequence and we need to create the following vectors: 3520 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 3521 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 3522 ... 3523 3524 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}. 3525 The last mask is illegal since we assume two operands for permute 3526 operation, and the mask element values can't be outside that range. 3527 Hence, the last mask must be converted into {2,5,5,5}. 
3528 For the first two permutations we need the first and the second input 3529 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation 3530 we need the second and the third vectors: {b1,c1,a2,b2} and 3531 {c2,a3,b3,c3}. */ 3532 3533 int vect_stmts_counter = 0; 3534 int index = 0; 3535 int first_vec_index = -1; 3536 int second_vec_index = -1; 3537 bool noop_p = true; 3538 *n_perms = 0; 3539 3540 for (int j = 0; j < vf; j++) 3541 { 3542 for (int k = 0; k < group_size; k++) 3543 { 3544 int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] 3545 + j * STMT_VINFO_GROUP_SIZE (stmt_info)); 3546 vec_index = i / nunits; 3547 mask_element = i % nunits; 3548 if (vec_index == first_vec_index 3549 || first_vec_index == -1) 3550 { 3551 first_vec_index = vec_index; 3552 } 3553 else if (vec_index == second_vec_index 3554 || second_vec_index == -1) 3555 { 3556 second_vec_index = vec_index; 3557 mask_element += nunits; 3558 } 3559 else 3560 { 3561 if (dump_enabled_p ()) 3562 { 3563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3564 "permutation requires at " 3565 "least three vectors "); 3566 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 3567 stmt, 0); 3568 } 3569 gcc_assert (analyze_only); 3570 return false; 3571 } 3572 3573 gcc_assert (mask_element >= 0 3574 && mask_element < 2 * nunits); 3575 if (mask_element != index) 3576 noop_p = false; 3577 mask[index++] = mask_element; 3578 3579 if (index == nunits) 3580 { 3581 if (! noop_p 3582 && ! can_vec_perm_p (mode, false, mask)) 3583 { 3584 if (dump_enabled_p ()) 3585 { 3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, 3587 vect_location, 3588 "unsupported vect permute { "); 3589 for (i = 0; i < nunits; ++i) 3590 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ", mask[i]); 3591 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); 3592 } 3593 gcc_assert (analyze_only); 3594 return false; 3595 } 3596 3597 if (! noop_p) 3598 ++*n_perms; 3599 3600 if (!analyze_only) 3601 { 3602 tree mask_vec = NULL_TREE; 3603 3604 if (! noop_p) 3605 { 3606 tree *mask_elts = XALLOCAVEC (tree, nunits); 3607 for (int l = 0; l < nunits; ++l) 3608 mask_elts[l] = build_int_cst (mask_element_type, 3609 mask[l]); 3610 mask_vec = build_vector (mask_type, mask_elts); 3611 } 3612 3613 if (second_vec_index == -1) 3614 second_vec_index = first_vec_index; 3615 3616 /* Generate the permute statement if necessary. */ 3617 tree first_vec = dr_chain[first_vec_index]; 3618 tree second_vec = dr_chain[second_vec_index]; 3619 gimple *perm_stmt; 3620 if (! noop_p) 3621 { 3622 tree perm_dest 3623 = vect_create_destination_var (gimple_assign_lhs (stmt), 3624 vectype); 3625 perm_dest = make_ssa_name (perm_dest); 3626 perm_stmt = gimple_build_assign (perm_dest, 3627 VEC_PERM_EXPR, 3628 first_vec, second_vec, 3629 mask_vec); 3630 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 3631 } 3632 else 3633 /* If mask was NULL_TREE generate the requested 3634 identity transform. */ 3635 perm_stmt = SSA_NAME_DEF_STMT (first_vec); 3636 3637 /* Store the vector statement in NODE. */ 3638 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt; 3639 } 3640 3641 index = 0; 3642 first_vec_index = -1; 3643 second_vec_index = -1; 3644 noop_p = true; 3645 } 3646 } 3647 } 3648 3649 return true; 3650 } 3651 3652 3653 3654 /* Vectorize SLP instance tree in postorder. 
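   Before the scheduling code, the index arithmetic used by
   vect_transform_slp_perm_load above can be written out standalone
   (a sketch with hypothetical names; the real code builds tree masks
   and enforces the two-input-vector limit):  */

#if 0
#include <stdio.h>

/* Sketch: print which input vector and element each lane of the
   permuted loads selects.  PERM is the node's load permutation
   (SLP_GROUP_SIZE entries), DR_GROUP_SIZE the interleaving group size,
   NUNITS the vector width and VF the unrolling factor.  With
   perm == {0,0,0}, dr_group_size == 3, nunits == 4 and vf == 4 this
   reproduces the {0,0,0,3} {3,3,6,6} {6,9,9,9} masks for the a's in
   the example above.  */
static void
sketch_perm_masks (const int *perm, int slp_group_size,
                   int dr_group_size, int nunits, int vf)
{
  for (int j = 0; j < vf; j++)
    for (int k = 0; k < slp_group_size; k++)
      {
        int i = perm[k] + j * dr_group_size;
        printf ("vector %d, element %d\n", i / nunits, i % nunits);
      }
}
#endif

/* Vectorize the subtree rooted at NODE after all of its children.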

/* Vectorize the SLP instance tree rooted at NODE in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple *stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i, j;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  One vector iteration
     covers the number of scalar elements in one scalar iteration (GROUP_SIZE)
     multiplied by VF, so the number of vector stmts is that product divided
     by the vector size (NUNITS).  Unless this is an SLP reduction, in which
     case the number of vector stmts equals the number of vector stmts of
     the children.  */
  if (GROUP_FIRST_ELEMENT (stmt_info)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
    vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
  else
    vec_stmts_size = (vectorization_factor * group_size) / nunits;

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
    }

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  /* Handle two-operation SLP nodes by vectorizing the group with
     both operations and then performing a merge.  */
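  /* For instance (an illustration, not computed here): for the scalar
     group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } with NUNITS == 4, the
     code below computes MASK == { 0, 1, 0, 1 }, vectorizes the whole
     group once with PLUS_EXPR (giving V0) and once with MINUS_EXPR
     (giving V1), and merges the two results with

         vect = VEC_PERM_EXPR <V0, V1, { 0, 5, 2, 7 }>;

     i.e. lane L is taken from V0 when MASK[L] == 0 and from V1 otherwise,
     via selector element MASK[k] * NUNITS + L.  */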
  if (SLP_TREE_TWO_OPERATORS (node))
    {
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      gimple *ostmt;
      unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
        if (gimple_assign_rhs_code (ostmt) != code0)
          {
            mask[i] = 1;
            ocode = gimple_assign_rhs_code (ostmt);
          }
        else
          mask[i] = 0;
      if (ocode != ERROR_MARK)
        {
          vec<gimple *> v0;
          vec<gimple *> v1;
          unsigned j;
          tree tmask = NULL_TREE;
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          v0 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          gimple_assign_set_rhs_code (stmt, ocode);
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          gimple_assign_set_rhs_code (stmt, code0);
          v1 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          tree meltype = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
          tree mvectype = get_same_sized_vectype (meltype, vectype);
          unsigned k = 0, l;
          for (j = 0; j < v0.length (); ++j)
            {
              tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
              for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
                {
                  if (k >= group_size)
                    k = 0;
                  melts[l] = build_int_cst
                    (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
                }
              tmask = build_vector (mvectype, melts);

              /* ??? Not all targets support a VEC_PERM_EXPR with a
                 constant mask that would translate to a vec_merge RTX
                 (with their vec_perm_const_ok).  We can either not
                 vectorize in that case or let veclower do its job.
                 Unfortunately that isn't too great and at least for
                 plus/minus we'd eventually like to match targets'
                 vector addsub instructions.  */
              gimple *vstmt;
              vstmt = gimple_build_assign (make_ssa_name (vectype),
                                           VEC_PERM_EXPR,
                                           gimple_assign_lhs (v0[j]),
                                           gimple_assign_lhs (v1[j]), tmask);
              vect_finish_stmt_generation (stmt, vstmt, &si);
              SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
            }
          v0.release ();
          v1.release ();
          return false;
        }
    }
  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;

  return is_store;
}


/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until the end of vect_schedule_slp, because
   multiple SLP instances may refer to the same scalar stmt.  */
static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple *stmt, *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}


/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  slp_instances = vinfo->slp_instances;
  if (is_a <loop_vec_info> (vinfo))
    vf = as_a <loop_vec_info> (vinfo)->vectorization_factor;
  else
    vf = 1;

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple *store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ??? Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ??? For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}
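
/* To illustrate the final loop above (made-up GIMPLE, not pass output):
   once an SLP instance rooted at the four scalar stores

       a[0] = x0;  a[1] = x1;  a[2] = x2;  a[3] = x3;

   has been scheduled into a single vector store

       MEM[(int *)&a] = vect_x.0_5;

   the scalar stores are unlinked, removed and their stmt_vec_infos freed
   right here, rather than being left to later cleanup passes.  */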