/* Loop Vectorization
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "params.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "tree-eh.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it were manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   More flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data
   elements that are operated upon in parallel in a single iteration of the
   vectorized loop.  For example, when vectorizing a loop that operates on
   4-byte elements, on a target with vector size (VS) of 16 bytes, the VF is
   set to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  unsigned int vectorization_factor = 0;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  unsigned i;
  HOST_WIDE_INT dummy;
  gimple *stmt, *pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;
  bool bool_result;
  auto_vec<stmt_vec_info> mask_producers;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===\n");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          phi = si.phi ();
          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info)
              || STMT_VINFO_LIVE_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
                                 nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si) || analyze_pattern_stmt;)
        {
          tree vf_vectype;

          if (analyze_pattern_stmt)
            stmt = pattern_stmt;
          else
            stmt = gsi_stmt (si);

          stmt_info = vinfo_for_stmt (stmt);

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if ((!STMT_VINFO_RELEVANT_P (stmt_info)
               && !STMT_VINFO_LIVE_P (stmt_info))
              || gimple_clobber_p (stmt))
            {
              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                {
                  stmt = pattern_stmt;
                  stmt_info = vinfo_for_stmt (pattern_stmt);
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "==> examining pattern statement: ");
                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                    }
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
                  gsi_next (&si);
                  continue;
                }
            }
          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
            analyze_pattern_stmt = true;

          /* If a pattern statement has def stmts, analyze them too.  */
          if (is_pattern_stmt_p (stmt_info))
            {
              if (pattern_def_seq == NULL)
                {
                  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                  pattern_def_si = gsi_start (pattern_def_seq);
                }
              else if (!gsi_end_p (pattern_def_si))
                gsi_next (&pattern_def_si);
              if (pattern_def_seq != NULL)
                {
                  gimple *pattern_def_stmt = NULL;
                  stmt_vec_info pattern_def_stmt_info = NULL;

                  while (!gsi_end_p (pattern_def_si))
                    {
                      pattern_def_stmt = gsi_stmt (pattern_def_si);
                      pattern_def_stmt_info
                        = vinfo_for_stmt (pattern_def_stmt);
                      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
                          || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
                        break;
                      gsi_next (&pattern_def_si);
                    }

                  if (!gsi_end_p (pattern_def_si))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> examining pattern def stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
                        }

                      stmt = pattern_def_stmt;
                      stmt_info = pattern_def_stmt_info;
                    }
                  else
                    {
                      pattern_def_si = gsi_none ();
                      analyze_pattern_stmt = false;
                    }
                }
              else
                analyze_pattern_stmt = false;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE
              /* MASK_STORE has no lhs, but is ok.  */
              && (!is_gimple_call (stmt)
                  || !gimple_call_internal_p (stmt)
                  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
            {
              if (is_gimple_call (stmt))
                {
                  /* Ignore calls with no lhs.  These must be calls to
                     #pragma omp simd functions, and what vectorization factor
                     it really needs can't be determined until
                     vectorizable_simd_clone_call.  */
                  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
                    {
                      pattern_def_seq = NULL;
                      gsi_next (&si);
                    }
                  continue;
                }
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: irregular stmt.");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                    0);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: vector stmt in loop:");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              return false;
            }

          bool_result = false;

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case in which a vectype has already been set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info)
                          || !gsi_end_p (pattern_def_si));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
              if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
                scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
              else
                scalar_type = TREE_TYPE (gimple_get_lhs (stmt));

              /* Bool ops don't participate in the vectorization factor
                 computation.  For comparisons, use the compared types to
                 compute a factor.  */
              if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
                  && is_gimple_assign (stmt)
                  && gimple_assign_rhs_code (stmt) != COND_EXPR)
                {
                  if (STMT_VINFO_RELEVANT_P (stmt_info)
                      || STMT_VINFO_LIVE_P (stmt_info))
                    mask_producers.safe_push (stmt_info);
                  bool_result = true;

                  if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
                      == tcc_comparison
                      && !VECT_SCALAR_BOOLEAN_TYPE_P
                           (TREE_TYPE (gimple_assign_rhs1 (stmt))))
                    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
                  else
                    {
                      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
                        {
                          pattern_def_seq = NULL;
                          gsi_next (&si);
                        }
                      continue;
                    }
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }

              if (!bool_result)
                STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }
            }

          /* Don't try to compute the VF out of the scalar types if the stmt
             produces a boolean vector.  Use the result vectype instead.  */
          if (VECTOR_BOOLEAN_TYPE_P (vectype))
            vf_vectype = vectype;
          else
            {
              /* The vectorization factor is according to the smallest
                 scalar type (or the largest vector size, but we only
                 support one vector size per loop).  */
              if (!bool_result)
                scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                             &dummy);
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }
              vf_vectype = get_vectype_for_scalar_type (scalar_type);
            }
          if (!vf_vectype)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: unsupported data-type ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     scalar_type);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: different sized vector "
                                   "types in statement, ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vf_vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
              dump_printf (MSG_NOTE, "\n");
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;

          if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
            }
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
                     vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported data-type\n");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  for (i = 0; i < mask_producers.length (); i++)
    {
      tree mask_type = NULL;

      stmt = STMT_VINFO_STMT (mask_producers[i]);

      if (is_gimple_assign (stmt)
          && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
          && !VECT_SCALAR_BOOLEAN_TYPE_P
                (TREE_TYPE (gimple_assign_rhs1 (stmt))))
        {
          scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
          mask_type = get_mask_type_for_scalar_type (scalar_type);

          if (!mask_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported mask\n");
              return false;
            }
        }
      else
        {
          tree rhs;
          ssa_op_iter iter;
          gimple *def_stmt;
          enum vect_def_type dt;

          FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
            {
              if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
                                       &def_stmt, &dt, &vectype))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: can't compute mask type "
                                       "for statement, ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                        0);
                    }
                  return false;
                }

              /* No vectype probably means external definition.
                 Allow it in case there is another operand from which the
                 mask type can be determined.  */
              if (!vectype)
                continue;

              if (!mask_type)
                mask_type = vectype;
              else if (TYPE_VECTOR_SUBPARTS (mask_type)
                       != TYPE_VECTOR_SUBPARTS (vectype))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: different sized mask "
                                       "types in statement, ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         mask_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         vectype);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
                       != VECTOR_BOOLEAN_TYPE_P (vectype))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: mixed mask and "
                                       "nonmask vector types in statement, ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         mask_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         vectype);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }

          /* We may compare a boolean value loaded as a vector of integers.
             Fix mask_type in such a case.  */
          if (mask_type
              && !VECTOR_BOOLEAN_TYPE_P (mask_type)
              && gimple_code (stmt) == GIMPLE_ASSIGN
              && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
            mask_type = build_same_sized_truth_vector_type (mask_type);
        }

      /* No mask_type should mean a loop invariant predicate.
         This is probably a subject for optimization in
         if-conversion.  */
      if (!mask_type)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: can't compute mask type "
                               "for statement, ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                0);
            }
          return false;
        }

      STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
    }

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".
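     E.g. for an access function {0, +, 4}_1 the evolution part is the
     constant 4 and the evolution is simple, whereas for
     {0, +, {1, +, 1}_1}_1 the evolution part is itself a chrec and is
     rejected by the check below (the chrec values here are illustrative).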
     */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
      dump_printf (MSG_NOTE, ", init: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
      dump_printf (MSG_NOTE, "\n");
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
          || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
              && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
          || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
              && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
                  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
          || !flag_associative_math))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step unknown.\n");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<gimple *, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_scalar_cycles ===\n");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        {
          STRIP_NOPS (access_fn);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Access function of PHI: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
              dump_printf (MSG_NOTE, "\n");
            }
          STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
            = initial_condition_in_loop_num (access_fn, loop->num);
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
        {
          worklist.safe_push (phi);
          continue;
        }

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
                  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      gimple *phi = worklist.pop ();
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple *reduc_stmt;
      bool nested_cycle;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc, false);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Detected double reduction.\n");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected vectorizable nested cycle.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_nested_cycle;
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected reduction.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
                }
            }
        }
      else
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown def-use cycle pattern.\n");
    }
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples of scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Transfer group and reduction information from STMT to its pattern stmt.  */

static void
vect_fixup_reduc_chain (gimple *stmt)
{
  gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
  gimple *stmtp;
  gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
              && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
  GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
      stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      if (stmt)
        GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
          = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
    }
  while (stmt);
  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
}

/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  gimple *first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
      {
        gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
        while (next)
          {
            if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
              break;
            next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          }
        /* If not all stmts in the chain are patterns, try to handle
           the chain without patterns.  */
        if (! next)
          {
            vect_fixup_reduc_chain (first);
            LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
              = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
          }
      }
}

/* Function vect_get_loop_niters.

   Determine the number of iterations the loop is executed for and place
   it in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.
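
   For instance, for a counted loop whose body executes N times, the latch
   is executed N - 1 times, so NUMBER_OF_ITERATIONSM1 is N - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is N.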
*/


static gcond *
vect_get_loop_niters (struct loop *loop, tree *assumptions,
                      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  struct tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== get_loop_niters ===\n");

  if (!exit)
    return cond;

  niter = chrec_dont_know;
  may_be_zero = NULL_TREE;
  niter_assumptions = boolean_true_node;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
        {
          /* Try to combine may_be_zero with assumptions; this can simplify
             the computation of the niter expression.  */
          if (niter_assumptions && !integer_nonzerop (niter_assumptions))
            niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                             niter_assumptions,
                                             fold_build1 (TRUTH_NOT_EXPR,
                                                          boolean_type_node,
                                                          may_be_zero));
          else
            niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
                                 build_int_cst (TREE_TYPE (niter), 0),
                                 rewrite_to_non_trapping_overflow (niter));

          may_be_zero = NULL_TREE;
        }
      else if (integer_nonzerop (may_be_zero))
        {
          *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
          *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
          return cond;
        }
      else
        return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions, which is the number
     of latch executions plus one.
     ??? For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
                         build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}

/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  res->kind = vec_info::loop;
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *phi = gsi_stmt (si);
          gimple_set_uid (phi, 0);
          set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          gimple_set_uid (stmt, 0);
          set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERSM1 (res) = NULL;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_NITERS_ASSUMPTIONS (res) = NULL;
  LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res) = vNULL;
  LOOP_VINFO_DATAREFS (res) = vNULL;
  LOOP_VINFO_DDRS (res) = vNULL;
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
  LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
  LOOP_VINFO_GROUPED_STORES (res) = vNULL;
  LOOP_VINFO_REDUCTIONS (res) = vNULL;
  LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
  LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
  LOOP_VINFO_PEELING_FOR_NITER (res) = false;
  LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
  LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  bool swapped;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = clean_stmts ? loop->num_nodes : 0;
  swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple *stmt = gsi_stmt (si);

          /* We may have broken canonical form by moving a constant
             into RHS1 of a commutative op.  Fix such occurrences.
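             E.g. the operand swapping done during analysis may have turned
             x_1 + 1 into 1 + x_1; swap the operands back so the constant
             ends up in RHS2 again (illustrative example).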
*/ 1221 if (swapped && is_gimple_assign (stmt)) 1222 { 1223 enum tree_code code = gimple_assign_rhs_code (stmt); 1224 1225 if ((code == PLUS_EXPR 1226 || code == POINTER_PLUS_EXPR 1227 || code == MULT_EXPR) 1228 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt))) 1229 swap_ssa_operands (stmt, 1230 gimple_assign_rhs1_ptr (stmt), 1231 gimple_assign_rhs2_ptr (stmt)); 1232 else if (code == COND_EXPR 1233 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt))) 1234 { 1235 tree cond_expr = gimple_assign_rhs1 (stmt); 1236 enum tree_code cond_code = TREE_CODE (cond_expr); 1237 1238 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) 1239 { 1240 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 1241 0)); 1242 cond_code = invert_tree_comparison (cond_code, 1243 honor_nans); 1244 if (cond_code != ERROR_MARK) 1245 { 1246 TREE_SET_CODE (cond_expr, cond_code); 1247 swap_ssa_operands (stmt, 1248 gimple_assign_rhs2_ptr (stmt), 1249 gimple_assign_rhs3_ptr (stmt)); 1250 } 1251 } 1252 } 1253 } 1254 1255 /* Free stmt_vec_info. */ 1256 free_stmt_vec_info (stmt); 1257 gsi_next (&si); 1258 } 1259 } 1260 1261 free (LOOP_VINFO_BBS (loop_vinfo)); 1262 vect_destroy_datarefs (loop_vinfo); 1263 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); 1264 LOOP_VINFO_LOOP_NEST (loop_vinfo).release (); 1265 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release (); 1266 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); 1267 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release (); 1268 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 1269 FOR_EACH_VEC_ELT (slp_instances, j, instance) 1270 vect_free_slp_instance (instance); 1271 1272 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 1273 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release (); 1274 LOOP_VINFO_REDUCTIONS (loop_vinfo).release (); 1275 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release (); 1276 1277 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); 1278 loop_vinfo->scalar_cost_vec.release (); 1279 1280 free (loop_vinfo); 1281 loop->aux = NULL; 1282 } 1283 1284 1285 /* Calculate the cost of one scalar iteration of the loop. */ 1286 static void 1287 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) 1288 { 1289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1291 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0; 1292 int innerloop_iters, i; 1293 1294 /* Count statements in scalar loop. Using this as scalar cost for a single 1295 iteration for now. 1296 1297 TODO: Add outer loop support. 1298 1299 TODO: Consider assigning different costs to different scalar 1300 statements. */ 1301 1302 /* FORNOW. */ 1303 innerloop_iters = 1; 1304 if (loop->inner) 1305 innerloop_iters = 50; /* FIXME */ 1306 1307 for (i = 0; i < nbbs; i++) 1308 { 1309 gimple_stmt_iterator si; 1310 basic_block bb = bbs[i]; 1311 1312 if (bb->loop_father == loop->inner) 1313 factor = innerloop_iters; 1314 else 1315 factor = 1; 1316 1317 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1318 { 1319 gimple *stmt = gsi_stmt (si); 1320 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1321 1322 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) 1323 continue; 1324 1325 /* Skip stmts that are not vectorized inside the loop. 
*/ 1326 if (stmt_info 1327 && !STMT_VINFO_RELEVANT_P (stmt_info) 1328 && (!STMT_VINFO_LIVE_P (stmt_info) 1329 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1330 && !STMT_VINFO_IN_PATTERN_P (stmt_info)) 1331 continue; 1332 1333 vect_cost_for_stmt kind; 1334 if (STMT_VINFO_DATA_REF (stmt_info)) 1335 { 1336 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 1337 kind = scalar_load; 1338 else 1339 kind = scalar_store; 1340 } 1341 else 1342 kind = scalar_stmt; 1343 1344 scalar_single_iter_cost 1345 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1346 factor, kind, stmt_info, 0, vect_prologue); 1347 } 1348 } 1349 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) 1350 = scalar_single_iter_cost; 1351 } 1352 1353 1354 /* Function vect_analyze_loop_form_1. 1355 1356 Verify that certain CFG restrictions hold, including: 1357 - the loop has a pre-header 1358 - the loop has a single entry and exit 1359 - the loop exit condition is simple enough 1360 - the number of iterations can be analyzed, i.e, a countable loop. The 1361 niter could be analyzed under some assumptions. */ 1362 1363 bool 1364 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, 1365 tree *assumptions, tree *number_of_iterationsm1, 1366 tree *number_of_iterations, gcond **inner_loop_cond) 1367 { 1368 if (dump_enabled_p ()) 1369 dump_printf_loc (MSG_NOTE, vect_location, 1370 "=== vect_analyze_loop_form ===\n"); 1371 1372 /* Different restrictions apply when we are considering an inner-most loop, 1373 vs. an outer (nested) loop. 1374 (FORNOW. May want to relax some of these restrictions in the future). */ 1375 1376 if (!loop->inner) 1377 { 1378 /* Inner-most loop. We currently require that the number of BBs is 1379 exactly 2 (the header and latch). Vectorizable inner-most loops 1380 look like this: 1381 1382 (pre-header) 1383 | 1384 header <--------+ 1385 | | | 1386 | +--> latch --+ 1387 | 1388 (exit-bb) */ 1389 1390 if (loop->num_nodes != 2) 1391 { 1392 if (dump_enabled_p ()) 1393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1394 "not vectorized: control flow in loop.\n"); 1395 return false; 1396 } 1397 1398 if (empty_block_p (loop->header)) 1399 { 1400 if (dump_enabled_p ()) 1401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1402 "not vectorized: empty loop.\n"); 1403 return false; 1404 } 1405 } 1406 else 1407 { 1408 struct loop *innerloop = loop->inner; 1409 edge entryedge; 1410 1411 /* Nested loop. We currently require that the loop is doubly-nested, 1412 contains a single inner loop, and the number of BBs is exactly 5. 1413 Vectorizable outer-loops look like this: 1414 1415 (pre-header) 1416 | 1417 header <---+ 1418 | | 1419 inner-loop | 1420 | | 1421 tail ------+ 1422 | 1423 (exit-bb) 1424 1425 The inner-loop has the properties expected of inner-most loops 1426 as described above. 
*/ 1427 1428 if ((loop->inner)->inner || (loop->inner)->next) 1429 { 1430 if (dump_enabled_p ()) 1431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1432 "not vectorized: multiple nested loops.\n"); 1433 return false; 1434 } 1435 1436 if (loop->num_nodes != 5) 1437 { 1438 if (dump_enabled_p ()) 1439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1440 "not vectorized: control flow in loop.\n"); 1441 return false; 1442 } 1443 1444 entryedge = loop_preheader_edge (innerloop); 1445 if (entryedge->src != loop->header 1446 || !single_exit (innerloop) 1447 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src) 1448 { 1449 if (dump_enabled_p ()) 1450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1451 "not vectorized: unsupported outerloop form.\n"); 1452 return false; 1453 } 1454 1455 /* Analyze the inner-loop. */ 1456 tree inner_niterm1, inner_niter, inner_assumptions; 1457 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond, 1458 &inner_assumptions, &inner_niterm1, 1459 &inner_niter, NULL) 1460 /* Don't support analyzing niter under assumptions for inner 1461 loop. */ 1462 || !integer_onep (inner_assumptions)) 1463 { 1464 if (dump_enabled_p ()) 1465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1466 "not vectorized: Bad inner loop.\n"); 1467 return false; 1468 } 1469 1470 if (!expr_invariant_in_loop_p (loop, inner_niter)) 1471 { 1472 if (dump_enabled_p ()) 1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1474 "not vectorized: inner-loop count not" 1475 " invariant.\n"); 1476 return false; 1477 } 1478 1479 if (dump_enabled_p ()) 1480 dump_printf_loc (MSG_NOTE, vect_location, 1481 "Considering outer-loop vectorization.\n"); 1482 } 1483 1484 if (!single_exit (loop) 1485 || EDGE_COUNT (loop->header->preds) != 2) 1486 { 1487 if (dump_enabled_p ()) 1488 { 1489 if (!single_exit (loop)) 1490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1491 "not vectorized: multiple exits.\n"); 1492 else if (EDGE_COUNT (loop->header->preds) != 2) 1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1494 "not vectorized: too many incoming edges.\n"); 1495 } 1496 return false; 1497 } 1498 1499 /* We assume that the loop exit condition is at the end of the loop. i.e, 1500 that the loop is represented as a do-while (with a proper if-guard 1501 before the loop if needed), where the loop header contains all the 1502 executable statements, and the latch is empty. */ 1503 if (!empty_block_p (loop->latch) 1504 || !gimple_seq_empty_p (phi_nodes (loop->latch))) 1505 { 1506 if (dump_enabled_p ()) 1507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1508 "not vectorized: latch block not empty.\n"); 1509 return false; 1510 } 1511 1512 /* Make sure the exit is not abnormal. 
*/ 1513 edge e = single_exit (loop); 1514 if (e->flags & EDGE_ABNORMAL) 1515 { 1516 if (dump_enabled_p ()) 1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1518 "not vectorized: abnormal loop exit edge.\n"); 1519 return false; 1520 } 1521 1522 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations, 1523 number_of_iterationsm1); 1524 if (!*loop_cond) 1525 { 1526 if (dump_enabled_p ()) 1527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1528 "not vectorized: complicated exit condition.\n"); 1529 return false; 1530 } 1531 1532 if (integer_zerop (*assumptions) 1533 || !*number_of_iterations 1534 || chrec_contains_undetermined (*number_of_iterations)) 1535 { 1536 if (dump_enabled_p ()) 1537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1538 "not vectorized: number of iterations cannot be " 1539 "computed.\n"); 1540 return false; 1541 } 1542 1543 if (integer_zerop (*number_of_iterations)) 1544 { 1545 if (dump_enabled_p ()) 1546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1547 "not vectorized: number of iterations = 0.\n"); 1548 return false; 1549 } 1550 1551 return true; 1552 } 1553 1554 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */ 1555 1556 loop_vec_info 1557 vect_analyze_loop_form (struct loop *loop) 1558 { 1559 tree assumptions, number_of_iterations, number_of_iterationsm1; 1560 gcond *loop_cond, *inner_loop_cond = NULL; 1561 1562 if (! vect_analyze_loop_form_1 (loop, &loop_cond, 1563 &assumptions, &number_of_iterationsm1, 1564 &number_of_iterations, &inner_loop_cond)) 1565 return NULL; 1566 1567 loop_vec_info loop_vinfo = new_loop_vec_info (loop); 1568 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1; 1569 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; 1570 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations; 1571 if (!integer_onep (assumptions)) 1572 { 1573 /* We consider to vectorize this loop by versioning it under 1574 some assumptions. In order to do this, we need to clear 1575 existing information computed by scev and niter analyzer. */ 1576 scev_reset_htab (); 1577 free_numbers_of_iterations_estimates_loop (loop); 1578 /* Also set flag for this loop so that following scev and niter 1579 analysis are done under the assumptions. */ 1580 loop_constraint_set (loop, LOOP_C_FINITE); 1581 /* Also record the assumptions for versioning. */ 1582 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions; 1583 } 1584 1585 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1586 { 1587 if (dump_enabled_p ()) 1588 { 1589 dump_printf_loc (MSG_NOTE, vect_location, 1590 "Symbolic number of iterations is "); 1591 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations); 1592 dump_printf (MSG_NOTE, "\n"); 1593 } 1594 } 1595 1596 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type; 1597 if (inner_loop_cond) 1598 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond)) 1599 = loop_exit_ctrl_vec_info_type; 1600 1601 gcc_assert (!loop->aux); 1602 loop->aux = loop_vinfo; 1603 return loop_vinfo; 1604 } 1605 1606 1607 1608 /* Scan the loop stmts and dependent on whether there are any (non-)SLP 1609 statements update the vectorization factor. 
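   E.g. if the loop's current vectorization factor is 2 and the SLP
   instances require an unrolling factor of 4, the factor is updated to
   least_common_multiple (2, 4) = 4; if the whole loop is pure SLP, the SLP
   unrolling factor is used directly (illustrative numbers).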
*/ 1610 1611 static void 1612 vect_update_vf_for_slp (loop_vec_info loop_vinfo) 1613 { 1614 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1615 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1616 int nbbs = loop->num_nodes; 1617 unsigned int vectorization_factor; 1618 int i; 1619 1620 if (dump_enabled_p ()) 1621 dump_printf_loc (MSG_NOTE, vect_location, 1622 "=== vect_update_vf_for_slp ===\n"); 1623 1624 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1625 gcc_assert (vectorization_factor != 0); 1626 1627 /* If all the stmts in the loop can be SLPed, we perform only SLP, and 1628 vectorization factor of the loop is the unrolling factor required by 1629 the SLP instances. If that unrolling factor is 1, we say, that we 1630 perform pure SLP on loop - cross iteration parallelism is not 1631 exploited. */ 1632 bool only_slp_in_loop = true; 1633 for (i = 0; i < nbbs; i++) 1634 { 1635 basic_block bb = bbs[i]; 1636 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1637 gsi_next (&si)) 1638 { 1639 gimple *stmt = gsi_stmt (si); 1640 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1641 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 1642 && STMT_VINFO_RELATED_STMT (stmt_info)) 1643 { 1644 stmt = STMT_VINFO_RELATED_STMT (stmt_info); 1645 stmt_info = vinfo_for_stmt (stmt); 1646 } 1647 if ((STMT_VINFO_RELEVANT_P (stmt_info) 1648 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1649 && !PURE_SLP_STMT (stmt_info)) 1650 /* STMT needs both SLP and loop-based vectorization. */ 1651 only_slp_in_loop = false; 1652 } 1653 } 1654 1655 if (only_slp_in_loop) 1656 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); 1657 else 1658 vectorization_factor 1659 = least_common_multiple (vectorization_factor, 1660 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); 1661 1662 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 1663 if (dump_enabled_p ()) 1664 dump_printf_loc (MSG_NOTE, vect_location, 1665 "Updating vectorization factor to %d\n", 1666 vectorization_factor); 1667 } 1668 1669 /* Function vect_analyze_loop_operations. 1670 1671 Scan the loop stmts and make sure they are all vectorizable. */ 1672 1673 static bool 1674 vect_analyze_loop_operations (loop_vec_info loop_vinfo) 1675 { 1676 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1677 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1678 int nbbs = loop->num_nodes; 1679 int i; 1680 stmt_vec_info stmt_info; 1681 bool need_to_vectorize = false; 1682 bool ok; 1683 1684 if (dump_enabled_p ()) 1685 dump_printf_loc (MSG_NOTE, vect_location, 1686 "=== vect_analyze_loop_operations ===\n"); 1687 1688 for (i = 0; i < nbbs; i++) 1689 { 1690 basic_block bb = bbs[i]; 1691 1692 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 1693 gsi_next (&si)) 1694 { 1695 gphi *phi = si.phi (); 1696 ok = true; 1697 1698 stmt_info = vinfo_for_stmt (phi); 1699 if (dump_enabled_p ()) 1700 { 1701 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: "); 1702 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 1703 } 1704 if (virtual_operand_p (gimple_phi_result (phi))) 1705 continue; 1706 1707 /* Inner-loop loop-closed exit phi in outer-loop vectorization 1708 (i.e., a phi in the tail of the outer-loop). */ 1709 if (! is_loop_header_bb_p (bb)) 1710 { 1711 /* FORNOW: we currently don't support the case that these phis 1712 are not used in the outerloop (unless it is double reduction, 1713 i.e., this phi is vect_reduction_def), cause this case 1714 requires to actually do something here. 
*/ 1715 if ((!STMT_VINFO_RELEVANT_P (stmt_info) 1716 || STMT_VINFO_LIVE_P (stmt_info)) 1717 && STMT_VINFO_DEF_TYPE (stmt_info) 1718 != vect_double_reduction_def) 1719 { 1720 if (dump_enabled_p ()) 1721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1722 "Unsupported loop-closed phi in " 1723 "outer-loop.\n"); 1724 return false; 1725 } 1726 1727 /* If PHI is used in the outer loop, we check that its operand 1728 is defined in the inner loop. */ 1729 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1730 { 1731 tree phi_op; 1732 gimple *op_def_stmt; 1733 1734 if (gimple_phi_num_args (phi) != 1) 1735 return false; 1736 1737 phi_op = PHI_ARG_DEF (phi, 0); 1738 if (TREE_CODE (phi_op) != SSA_NAME) 1739 return false; 1740 1741 op_def_stmt = SSA_NAME_DEF_STMT (phi_op); 1742 if (gimple_nop_p (op_def_stmt) 1743 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt)) 1744 || !vinfo_for_stmt (op_def_stmt)) 1745 return false; 1746 1747 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt)) 1748 != vect_used_in_outer 1749 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt)) 1750 != vect_used_in_outer_by_reduction) 1751 return false; 1752 } 1753 1754 continue; 1755 } 1756 1757 gcc_assert (stmt_info); 1758 1759 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope 1760 || STMT_VINFO_LIVE_P (stmt_info)) 1761 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 1762 { 1763 /* A scalar-dependence cycle that we don't support. */ 1764 if (dump_enabled_p ()) 1765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1766 "not vectorized: scalar dependence cycle.\n"); 1767 return false; 1768 } 1769 1770 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1771 { 1772 need_to_vectorize = true; 1773 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) 1774 ok = vectorizable_induction (phi, NULL, NULL); 1775 } 1776 1777 if (ok && STMT_VINFO_LIVE_P (stmt_info)) 1778 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL); 1779 1780 if (!ok) 1781 { 1782 if (dump_enabled_p ()) 1783 { 1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1785 "not vectorized: relevant phi not " 1786 "supported: "); 1787 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0); 1788 } 1789 return false; 1790 } 1791 } 1792 1793 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1794 gsi_next (&si)) 1795 { 1796 gimple *stmt = gsi_stmt (si); 1797 if (!gimple_clobber_p (stmt) 1798 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL)) 1799 return false; 1800 } 1801 } /* bbs */ 1802 1803 /* All operations in the loop are either irrelevant (deal with loop 1804 control, or dead), or only used outside the loop and can be moved 1805 out of the loop (e.g. invariants, inductions). The loop can be 1806 optimized away by scalar optimizations. We're better off not 1807 touching this loop. */ 1808 if (!need_to_vectorize) 1809 { 1810 if (dump_enabled_p ()) 1811 dump_printf_loc (MSG_NOTE, vect_location, 1812 "All the computation can be taken out of the loop.\n"); 1813 if (dump_enabled_p ()) 1814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1815 "not vectorized: redundant loop. no profit to " 1816 "vectorize.\n"); 1817 return false; 1818 } 1819 1820 return true; 1821 } 1822 1823 1824 /* Function vect_analyze_loop_2. 1825 1826 Apply a set of analyses on LOOP, and create a loop_vec_info struct 1827 for it. The different analyses will record information in the 1828 loop_vec_info struct. 
*/ 1829 static bool 1830 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) 1831 { 1832 bool ok; 1833 int max_vf = MAX_VECTORIZATION_FACTOR; 1834 int min_vf = 2; 1835 unsigned int n_stmts = 0; 1836 1837 /* The first group of checks is independent of the vector size. */ 1838 fatal = true; 1839 1840 /* Find all data references in the loop (which correspond to vdefs/vuses) 1841 and analyze their evolution in the loop. */ 1842 1843 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1844 1845 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); 1846 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))) 1847 { 1848 if (dump_enabled_p ()) 1849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1850 "not vectorized: loop nest containing two " 1851 "or more consecutive inner loops cannot be " 1852 "vectorized\n"); 1853 return false; 1854 } 1855 1856 for (unsigned i = 0; i < loop->num_nodes; i++) 1857 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); 1858 !gsi_end_p (gsi); gsi_next (&gsi)) 1859 { 1860 gimple *stmt = gsi_stmt (gsi); 1861 if (is_gimple_debug (stmt)) 1862 continue; 1863 ++n_stmts; 1864 if (!find_data_references_in_stmt (loop, stmt, 1865 &LOOP_VINFO_DATAREFS (loop_vinfo))) 1866 { 1867 if (is_gimple_call (stmt) && loop->safelen) 1868 { 1869 tree fndecl = gimple_call_fndecl (stmt), op; 1870 if (fndecl != NULL_TREE) 1871 { 1872 cgraph_node *node = cgraph_node::get (fndecl); 1873 if (node != NULL && node->simd_clones != NULL) 1874 { 1875 unsigned int j, n = gimple_call_num_args (stmt); 1876 for (j = 0; j < n; j++) 1877 { 1878 op = gimple_call_arg (stmt, j); 1879 if (DECL_P (op) 1880 || (REFERENCE_CLASS_P (op) 1881 && get_base_address (op))) 1882 break; 1883 } 1884 op = gimple_call_lhs (stmt); 1885 /* Ignore #pragma omp declare simd functions 1886 if they don't have data references in the 1887 call stmt itself. */ 1888 if (j == n 1889 && !(op 1890 && (DECL_P (op) 1891 || (REFERENCE_CLASS_P (op) 1892 && get_base_address (op))))) 1893 continue; 1894 } 1895 } 1896 } 1897 if (dump_enabled_p ()) 1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1899 "not vectorized: loop contains function " 1900 "calls or data references that cannot " 1901 "be analyzed\n"); 1902 return false; 1903 } 1904 } 1905 1906 /* Analyze the data references and also adjust the minimal 1907 vectorization factor according to the loads and stores. */ 1908 1909 ok = vect_analyze_data_refs (loop_vinfo, &min_vf); 1910 if (!ok) 1911 { 1912 if (dump_enabled_p ()) 1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1914 "bad data references.\n"); 1915 return false; 1916 } 1917 1918 /* Classify all cross-iteration scalar data-flow cycles. 1919 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 1920 vect_analyze_scalar_cycles (loop_vinfo); 1921 1922 vect_pattern_recog (loop_vinfo); 1923 1924 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); 1925 1926 /* Analyze the access patterns of the data-refs in the loop (consecutive, 1927 complex, etc.). FORNOW: Only handle consecutive access pattern. */ 1928 1929 ok = vect_analyze_data_ref_accesses (loop_vinfo); 1930 if (!ok) 1931 { 1932 if (dump_enabled_p ()) 1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1934 "bad data access.\n"); 1935 return false; 1936 } 1937 1938 /* Data-flow analysis to detect stmts that do not need to be vectorized. 
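   For example (an illustrative note, not an exhaustive rule): statements
   that only compute the loop bound or the exit test serve loop control,
   are typically not marked relevant, and get no vector counterpart; the
   same holds for dead scalar statements.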
*/ 1939 1940 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); 1941 if (!ok) 1942 { 1943 if (dump_enabled_p ()) 1944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1945 "unexpected pattern.\n"); 1946 return false; 1947 } 1948 1949 /* While the rest of the analysis below depends on it in some way. */ 1950 fatal = false; 1951 1952 /* Analyze data dependences between the data-refs in the loop 1953 and adjust the maximum vectorization factor according to 1954 the dependences. 1955 FORNOW: fail at the first data dependence that we encounter. */ 1956 1957 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); 1958 if (!ok 1959 || max_vf < min_vf) 1960 { 1961 if (dump_enabled_p ()) 1962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1963 "bad data dependence.\n"); 1964 return false; 1965 } 1966 1967 ok = vect_determine_vectorization_factor (loop_vinfo); 1968 if (!ok) 1969 { 1970 if (dump_enabled_p ()) 1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1972 "can't determine vectorization factor.\n"); 1973 return false; 1974 } 1975 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo)) 1976 { 1977 if (dump_enabled_p ()) 1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1979 "bad data dependence.\n"); 1980 return false; 1981 } 1982 1983 /* Compute the scalar iteration cost. */ 1984 vect_compute_single_scalar_iteration_cost (loop_vinfo); 1985 1986 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1987 HOST_WIDE_INT estimated_niter; 1988 unsigned th; 1989 int min_scalar_loop_bound; 1990 1991 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ 1992 ok = vect_analyze_slp (loop_vinfo, n_stmts); 1993 if (!ok) 1994 return false; 1995 1996 /* If there are any SLP instances mark them as pure_slp. */ 1997 bool slp = vect_make_slp_decision (loop_vinfo); 1998 if (slp) 1999 { 2000 /* Find stmts that need to be both vectorized and SLPed. */ 2001 vect_detect_hybrid_slp (loop_vinfo); 2002 2003 /* Update the vectorization factor based on the SLP decision. */ 2004 vect_update_vf_for_slp (loop_vinfo); 2005 } 2006 2007 /* This is the point where we can re-start analysis with SLP forced off. */ 2008 start_over: 2009 2010 /* Now the vectorization factor is final. */ 2011 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2012 gcc_assert (vectorization_factor != 0); 2013 2014 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) 2015 dump_printf_loc (MSG_NOTE, vect_location, 2016 "vectorization_factor = %d, niters = " 2017 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor, 2018 LOOP_VINFO_INT_NITERS (loop_vinfo)); 2019 2020 HOST_WIDE_INT max_niter 2021 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); 2022 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2023 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor)) 2024 || (max_niter != -1 2025 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor)) 2026 { 2027 if (dump_enabled_p ()) 2028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2029 "not vectorized: iteration count smaller than " 2030 "vectorization factor.\n"); 2031 return false; 2032 } 2033 2034 /* Analyze the alignment of the data-refs in the loop. 2035 Fail if a data reference is found that cannot be vectorized. 
*/

  ok = vect_analyze_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    return false;

  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization.  */
  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    {
      /* This pass will decide on using loop versioning and/or loop peeling
         in order to enhance the alignment of data references in the loop.  */
      ok = vect_enhance_data_refs_alignment (loop_vinfo);
      if (!ok)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "bad data alignment.\n");
          return false;
        }
    }

  if (slp)
    {
      /* Analyze operations in the SLP instances.  Note this may
         remove unsupported SLP instances which makes the above
         SLP kind detection invalid.  */
      unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
      vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
                                   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
      if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
        goto again;
    }

  /* Scan all the remaining operations in the loop that are not subject
     to SLP and make sure they are vectorizable.  */
  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.\n");
      return false;
    }

  /* If an epilogue loop is required because of data accesses with gaps,
     one additional iteration needs to be peeled.  Check if there are
     enough iterations for vectorization.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);

      if (wi::to_widest (scalar_niters) < vf)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "loop does not have enough iterations to "
                             "support peeling for gaps.\n");
          return false;
        }
    }

  /* Analyze cost.  Decide if it is worthwhile to vectorize.  */
  int min_profitable_estimate, min_profitable_iters;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
                         "profitable.\n");
      goto again;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the
     user-specified threshold.
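     A worked example with made-up numbers: with --param min-vect-loop-bound=2
     and a vectorization factor of 4, min_scalar_loop_bound is 2 * 4 - 1 = 7.
     If the cost model computed min_profitable_iters = 10, the threshold TH
     below becomes 10 (the more conservative value); if it computed 5 instead,
     TH stays at 7.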
*/ 2133 th = (unsigned) min_scalar_loop_bound; 2134 if (min_profitable_iters 2135 && (!min_scalar_loop_bound 2136 || min_profitable_iters > min_scalar_loop_bound)) 2137 th = (unsigned) min_profitable_iters; 2138 2139 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; 2140 2141 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2142 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th) 2143 { 2144 if (dump_enabled_p ()) 2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2146 "not vectorized: vectorization not profitable.\n"); 2147 if (dump_enabled_p ()) 2148 dump_printf_loc (MSG_NOTE, vect_location, 2149 "not vectorized: iteration count smaller than user " 2150 "specified loop bound parameter or minimum profitable " 2151 "iterations (whichever is more conservative).\n"); 2152 goto again; 2153 } 2154 2155 estimated_niter 2156 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); 2157 if (estimated_niter == -1) 2158 estimated_niter = max_niter; 2159 if (estimated_niter != -1 2160 && ((unsigned HOST_WIDE_INT) estimated_niter 2161 <= MAX (th, (unsigned)min_profitable_estimate))) 2162 { 2163 if (dump_enabled_p ()) 2164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2165 "not vectorized: estimated iteration count too " 2166 "small.\n"); 2167 if (dump_enabled_p ()) 2168 dump_printf_loc (MSG_NOTE, vect_location, 2169 "not vectorized: estimated iteration count smaller " 2170 "than specified loop bound parameter or minimum " 2171 "profitable iterations (whichever is more " 2172 "conservative).\n"); 2173 goto again; 2174 } 2175 2176 /* Decide whether we need to create an epilogue loop to handle 2177 remaining scalar iterations. */ 2178 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1) 2179 / LOOP_VINFO_VECT_FACTOR (loop_vinfo)) 2180 * LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2181 2182 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2183 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) 2184 { 2185 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo) 2186 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) 2187 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 2188 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2189 } 2190 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2191 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) 2192 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)) 2193 /* In case of versioning, check if the maximum number of 2194 iterations is greater than th. If they are identical, 2195 the epilogue is unnecessary. */ 2196 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 2197 || (unsigned HOST_WIDE_INT) max_niter > th))) 2198 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2199 2200 /* If an epilogue loop is required make sure we can create one. */ 2201 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2202 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2203 { 2204 if (dump_enabled_p ()) 2205 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); 2206 if (!vect_can_advance_ivs_p (loop_vinfo) 2207 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo), 2208 single_exit (LOOP_VINFO_LOOP 2209 (loop_vinfo)))) 2210 { 2211 if (dump_enabled_p ()) 2212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2213 "not vectorized: can't create required " 2214 "epilog loop\n"); 2215 goto again; 2216 } 2217 } 2218 2219 gcc_assert (vectorization_factor 2220 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 2221 2222 /* Ok to vectorize! 
*/ 2223 return true; 2224 2225 again: 2226 /* Try again with SLP forced off but if we didn't do any SLP there is 2227 no point in re-trying. */ 2228 if (!slp) 2229 return false; 2230 2231 /* If there are reduction chains re-trying will fail anyway. */ 2232 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) 2233 return false; 2234 2235 /* Likewise if the grouped loads or stores in the SLP cannot be handled 2236 via interleaving or lane instructions. */ 2237 slp_instance instance; 2238 slp_tree node; 2239 unsigned i, j; 2240 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 2241 { 2242 stmt_vec_info vinfo; 2243 vinfo = vinfo_for_stmt 2244 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]); 2245 if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) 2246 continue; 2247 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); 2248 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo); 2249 tree vectype = STMT_VINFO_VECTYPE (vinfo); 2250 if (! vect_store_lanes_supported (vectype, size) 2251 && ! vect_grouped_store_supported (vectype, size)) 2252 return false; 2253 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) 2254 { 2255 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]); 2256 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); 2257 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo); 2258 size = STMT_VINFO_GROUP_SIZE (vinfo); 2259 vectype = STMT_VINFO_VECTYPE (vinfo); 2260 if (! vect_load_lanes_supported (vectype, size) 2261 && ! vect_grouped_load_supported (vectype, single_element_p, 2262 size)) 2263 return false; 2264 } 2265 } 2266 2267 if (dump_enabled_p ()) 2268 dump_printf_loc (MSG_NOTE, vect_location, 2269 "re-trying with SLP disabled\n"); 2270 2271 /* Roll back state appropriately. No SLP this time. */ 2272 slp = false; 2273 /* Restore vectorization factor as it were without SLP. */ 2274 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; 2275 /* Free the SLP instances. */ 2276 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) 2277 vect_free_slp_instance (instance); 2278 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 2279 /* Reset SLP type to loop_vect on all stmts. */ 2280 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2281 { 2282 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2283 for (gimple_stmt_iterator si = gsi_start_bb (bb); 2284 !gsi_end_p (si); gsi_next (&si)) 2285 { 2286 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si)); 2287 STMT_SLP_TYPE (stmt_info) = loop_vect; 2288 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2289 { 2290 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); 2291 STMT_SLP_TYPE (stmt_info) = loop_vect; 2292 for (gimple_stmt_iterator pi 2293 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); 2294 !gsi_end_p (pi); gsi_next (&pi)) 2295 { 2296 gimple *pstmt = gsi_stmt (pi); 2297 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect; 2298 } 2299 } 2300 } 2301 } 2302 /* Free optimized alias test DDRS. */ 2303 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); 2304 /* Reset target cost data. */ 2305 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); 2306 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) 2307 = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); 2308 /* Reset assorted flags. */ 2309 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2310 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; 2311 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; 2312 2313 goto start_over; 2314 } 2315 2316 /* Function vect_analyze_loop. 
2317 2318 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2319 for it. The different analyses will record information in the 2320 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must 2321 be vectorized. */ 2322 loop_vec_info 2323 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo) 2324 { 2325 loop_vec_info loop_vinfo; 2326 unsigned int vector_sizes; 2327 2328 /* Autodetect first vector size we try. */ 2329 current_vector_size = 0; 2330 vector_sizes = targetm.vectorize.autovectorize_vector_sizes (); 2331 2332 if (dump_enabled_p ()) 2333 dump_printf_loc (MSG_NOTE, vect_location, 2334 "===== analyze_loop_nest =====\n"); 2335 2336 if (loop_outer (loop) 2337 && loop_vec_info_for_loop (loop_outer (loop)) 2338 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) 2339 { 2340 if (dump_enabled_p ()) 2341 dump_printf_loc (MSG_NOTE, vect_location, 2342 "outer-loop already vectorized.\n"); 2343 return NULL; 2344 } 2345 2346 while (1) 2347 { 2348 /* Check the CFG characteristics of the loop (nesting, entry/exit). */ 2349 loop_vinfo = vect_analyze_loop_form (loop); 2350 if (!loop_vinfo) 2351 { 2352 if (dump_enabled_p ()) 2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2354 "bad loop form.\n"); 2355 return NULL; 2356 } 2357 2358 bool fatal = false; 2359 2360 if (orig_loop_vinfo) 2361 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; 2362 2363 if (vect_analyze_loop_2 (loop_vinfo, fatal)) 2364 { 2365 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 2366 2367 return loop_vinfo; 2368 } 2369 2370 destroy_loop_vec_info (loop_vinfo, true); 2371 2372 vector_sizes &= ~current_vector_size; 2373 if (fatal 2374 || vector_sizes == 0 2375 || current_vector_size == 0) 2376 return NULL; 2377 2378 /* Try the next biggest vector size. */ 2379 current_vector_size = 1 << floor_log2 (vector_sizes); 2380 if (dump_enabled_p ()) 2381 dump_printf_loc (MSG_NOTE, vect_location, 2382 "***** Re-trying analysis with " 2383 "vector size %d\n", current_vector_size); 2384 } 2385 } 2386 2387 2388 /* Function reduction_code_for_scalar_code 2389 2390 Input: 2391 CODE - tree_code of a reduction operations. 2392 2393 Output: 2394 REDUC_CODE - the corresponding tree-code to be used to reduce the 2395 vector of partial results into a single scalar result, or ERROR_MARK 2396 if the operation is a supported reduction operation, but does not have 2397 such a tree-code. 2398 2399 Return FALSE if CODE currently cannot be vectorized as reduction. */ 2400 2401 static bool 2402 reduction_code_for_scalar_code (enum tree_code code, 2403 enum tree_code *reduc_code) 2404 { 2405 switch (code) 2406 { 2407 case MAX_EXPR: 2408 *reduc_code = REDUC_MAX_EXPR; 2409 return true; 2410 2411 case MIN_EXPR: 2412 *reduc_code = REDUC_MIN_EXPR; 2413 return true; 2414 2415 case PLUS_EXPR: 2416 *reduc_code = REDUC_PLUS_EXPR; 2417 return true; 2418 2419 case MULT_EXPR: 2420 case MINUS_EXPR: 2421 case BIT_IOR_EXPR: 2422 case BIT_XOR_EXPR: 2423 case BIT_AND_EXPR: 2424 *reduc_code = ERROR_MARK; 2425 return true; 2426 2427 default: 2428 return false; 2429 } 2430 } 2431 2432 2433 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement 2434 STMT is printed with a message MSG. 
*/

static void
report_vect_op (int msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s", msg);
  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
}


/* Detect SLP reduction of the form:

   #a1 = phi <a5, a0>
   a2 = operation (a1)
   a3 = operation (a2)
   a4 = operation (a3)
   a5 = operation (a4)

   #a = phi <a5>

   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  */

static bool
vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
                       gimple *first_stmt)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  enum tree_code code;
  gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
  stmt_vec_info use_stmt_info, current_stmt_info;
  tree lhs;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  int nloop_uses, size = 0, n_out_of_loop_uses;
  bool found = false;

  if (loop != vect_loop)
    return false;

  lhs = PHI_RESULT (phi);
  code = gimple_assign_rhs_code (first_stmt);
  while (1)
    {
      nloop_uses = 0;
      n_out_of_loop_uses = 0;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
        {
          gimple *use_stmt = USE_STMT (use_p);
          if (is_gimple_debug (use_stmt))
            continue;

          /* Check if we got back to the reduction phi.  */
          if (use_stmt == phi)
            {
              loop_use_stmt = use_stmt;
              found = true;
              break;
            }

          if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
            {
              loop_use_stmt = use_stmt;
              nloop_uses++;
            }
          else
            n_out_of_loop_uses++;

          /* There can be either a single use in the loop or two uses in
             phi nodes.  */
          if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
            return false;
        }

      if (found)
        break;

      /* We reached a statement with no loop uses.  */
      if (nloop_uses == 0)
        return false;

      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
      if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
        return false;

      if (!is_gimple_assign (loop_use_stmt)
          || code != gimple_assign_rhs_code (loop_use_stmt)
          || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
        return false;

      /* Insert USE_STMT into reduction chain.  */
      use_stmt_info = vinfo_for_stmt (loop_use_stmt);
      if (current_stmt)
        {
          current_stmt_info = vinfo_for_stmt (current_stmt);
          GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
          GROUP_FIRST_ELEMENT (use_stmt_info)
            = GROUP_FIRST_ELEMENT (current_stmt_info);
        }
      else
        GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;

      lhs = gimple_assign_lhs (loop_use_stmt);
      current_stmt = loop_use_stmt;
      size++;
    }

  if (!found || loop_use_stmt != phi || size < 2)
    return false;

  /* Swap the operands, if needed, to make the reduction operand be the second
     operand.
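     As an illustrative sketch (hypothetical SSA names): if a statement in
     the chain is

       a3 = a2 + b_5

     where a2 is the value flowing along the reduction chain, its operands
     are swapped below so that it becomes

       a3 = b_5 + a2

     and the chained value ends up in the second operand position.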
*/ 2550 lhs = PHI_RESULT (phi); 2551 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt)); 2552 while (next_stmt) 2553 { 2554 if (gimple_assign_rhs2 (next_stmt) == lhs) 2555 { 2556 tree op = gimple_assign_rhs1 (next_stmt); 2557 gimple *def_stmt = NULL; 2558 2559 if (TREE_CODE (op) == SSA_NAME) 2560 def_stmt = SSA_NAME_DEF_STMT (op); 2561 2562 /* Check that the other def is either defined in the loop 2563 ("vect_internal_def"), or it's an induction (defined by a 2564 loop-header phi-node). */ 2565 if (def_stmt 2566 && gimple_bb (def_stmt) 2567 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2568 && (is_gimple_assign (def_stmt) 2569 || is_gimple_call (def_stmt) 2570 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2571 == vect_induction_def 2572 || (gimple_code (def_stmt) == GIMPLE_PHI 2573 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2574 == vect_internal_def 2575 && !is_loop_header_bb_p (gimple_bb (def_stmt))))) 2576 { 2577 lhs = gimple_assign_lhs (next_stmt); 2578 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); 2579 continue; 2580 } 2581 2582 return false; 2583 } 2584 else 2585 { 2586 tree op = gimple_assign_rhs2 (next_stmt); 2587 gimple *def_stmt = NULL; 2588 2589 if (TREE_CODE (op) == SSA_NAME) 2590 def_stmt = SSA_NAME_DEF_STMT (op); 2591 2592 /* Check that the other def is either defined in the loop 2593 ("vect_internal_def"), or it's an induction (defined by a 2594 loop-header phi-node). */ 2595 if (def_stmt 2596 && gimple_bb (def_stmt) 2597 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2598 && (is_gimple_assign (def_stmt) 2599 || is_gimple_call (def_stmt) 2600 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2601 == vect_induction_def 2602 || (gimple_code (def_stmt) == GIMPLE_PHI 2603 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2604 == vect_internal_def 2605 && !is_loop_header_bb_p (gimple_bb (def_stmt))))) 2606 { 2607 if (dump_enabled_p ()) 2608 { 2609 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: "); 2610 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0); 2611 } 2612 2613 swap_ssa_operands (next_stmt, 2614 gimple_assign_rhs1_ptr (next_stmt), 2615 gimple_assign_rhs2_ptr (next_stmt)); 2616 update_stmt (next_stmt); 2617 2618 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) 2619 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 2620 } 2621 else 2622 return false; 2623 } 2624 2625 lhs = gimple_assign_lhs (next_stmt); 2626 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); 2627 } 2628 2629 /* Save the chain for further analysis in SLP detection. */ 2630 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt)); 2631 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first); 2632 GROUP_SIZE (vinfo_for_stmt (first)) = size; 2633 2634 return true; 2635 } 2636 2637 2638 /* Function vect_is_simple_reduction_1 2639 2640 (1) Detect a cross-iteration def-use cycle that represents a simple 2641 reduction computation. We look for the following pattern: 2642 2643 loop_header: 2644 a1 = phi < a0, a2 > 2645 a3 = ... 2646 a2 = operation (a3, a1) 2647 2648 or 2649 2650 a3 = ... 2651 loop_header: 2652 a1 = phi < a0, a2 > 2653 a2 = operation (a3, a1) 2654 2655 such that: 2656 1. operation is commutative and associative and it is safe to 2657 change the order of the computation (if CHECK_REDUCTION is true) 2658 2. no uses for a2 in the loop (a2 is used out of the loop) 2659 3. no uses of a1 in the loop besides the reduction operation 2660 4. no uses of a1 outside the loop. 2661 2662 Conditions 1,4 are tested here. 
2663 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. 2664 2665 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 2666 nested cycles, if CHECK_REDUCTION is false. 2667 2668 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double 2669 reductions: 2670 2671 a1 = phi < a0, a2 > 2672 inner loop (def of a3) 2673 a2 = phi < a3 > 2674 2675 (4) Detect condition expressions, ie: 2676 for (int i = 0; i < N; i++) 2677 if (a[i] < val) 2678 ret_val = a[i]; 2679 2680 */ 2681 2682 static gimple * 2683 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, 2684 bool check_reduction, bool *double_reduc, 2685 bool need_wrapping_integral_overflow, 2686 enum vect_reduction_type *v_reduc_type) 2687 { 2688 struct loop *loop = (gimple_bb (phi))->loop_father; 2689 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); 2690 edge latch_e = loop_latch_edge (loop); 2691 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 2692 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL; 2693 enum tree_code orig_code, code; 2694 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; 2695 tree type; 2696 int nloop_uses; 2697 tree name; 2698 imm_use_iterator imm_iter; 2699 use_operand_p use_p; 2700 bool phi_def; 2701 2702 *double_reduc = false; 2703 *v_reduc_type = TREE_CODE_REDUCTION; 2704 2705 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization, 2706 otherwise, we assume outer loop vectorization. */ 2707 gcc_assert ((check_reduction && loop == vect_loop) 2708 || (!check_reduction && flow_loop_nested_p (vect_loop, loop))); 2709 2710 name = PHI_RESULT (phi); 2711 /* ??? If there are no uses of the PHI result the inner loop reduction 2712 won't be detected as possibly double-reduction by vectorizable_reduction 2713 because that tries to walk the PHI arg from the preheader edge which 2714 can be constant. See PR60382. 
*/ 2715 if (has_zero_uses (name)) 2716 return NULL; 2717 nloop_uses = 0; 2718 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 2719 { 2720 gimple *use_stmt = USE_STMT (use_p); 2721 if (is_gimple_debug (use_stmt)) 2722 continue; 2723 2724 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2725 { 2726 if (dump_enabled_p ()) 2727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2728 "intermediate value used outside loop.\n"); 2729 2730 return NULL; 2731 } 2732 2733 nloop_uses++; 2734 if (nloop_uses > 1) 2735 { 2736 if (dump_enabled_p ()) 2737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2738 "reduction used in loop.\n"); 2739 return NULL; 2740 } 2741 2742 phi_use_stmt = use_stmt; 2743 } 2744 2745 if (TREE_CODE (loop_arg) != SSA_NAME) 2746 { 2747 if (dump_enabled_p ()) 2748 { 2749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2750 "reduction: not ssa_name: "); 2751 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg); 2752 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 2753 } 2754 return NULL; 2755 } 2756 2757 def_stmt = SSA_NAME_DEF_STMT (loop_arg); 2758 if (!def_stmt) 2759 { 2760 if (dump_enabled_p ()) 2761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2762 "reduction: no def_stmt.\n"); 2763 return NULL; 2764 } 2765 2766 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI) 2767 { 2768 if (dump_enabled_p ()) 2769 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0); 2770 return NULL; 2771 } 2772 2773 if (is_gimple_assign (def_stmt)) 2774 { 2775 name = gimple_assign_lhs (def_stmt); 2776 phi_def = false; 2777 } 2778 else 2779 { 2780 name = PHI_RESULT (def_stmt); 2781 phi_def = true; 2782 } 2783 2784 nloop_uses = 0; 2785 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 2786 { 2787 gimple *use_stmt = USE_STMT (use_p); 2788 if (is_gimple_debug (use_stmt)) 2789 continue; 2790 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2791 nloop_uses++; 2792 if (nloop_uses > 1) 2793 { 2794 if (dump_enabled_p ()) 2795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2796 "reduction used in loop.\n"); 2797 return NULL; 2798 } 2799 } 2800 2801 /* If DEF_STMT is a phi node itself, we expect it to have a single argument 2802 defined in the inner loop. */ 2803 if (phi_def) 2804 { 2805 op1 = PHI_ARG_DEF (def_stmt, 0); 2806 2807 if (gimple_phi_num_args (def_stmt) != 1 2808 || TREE_CODE (op1) != SSA_NAME) 2809 { 2810 if (dump_enabled_p ()) 2811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2812 "unsupported phi node definition.\n"); 2813 2814 return NULL; 2815 } 2816 2817 def1 = SSA_NAME_DEF_STMT (op1); 2818 if (gimple_bb (def1) 2819 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2820 && loop->inner 2821 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) 2822 && is_gimple_assign (def1) 2823 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) 2824 { 2825 if (dump_enabled_p ()) 2826 report_vect_op (MSG_NOTE, def_stmt, 2827 "detected double reduction: "); 2828 2829 *double_reduc = true; 2830 return def_stmt; 2831 } 2832 2833 return NULL; 2834 } 2835 2836 code = orig_code = gimple_assign_rhs_code (def_stmt); 2837 2838 /* We can handle "res -= x[i]", which is non-associative by 2839 simply rewriting this into "res += -x[i]". Avoid changing 2840 gimple instruction for the first simple tests and only do this 2841 if we're allowed to change code at all. 
*/ 2842 if (code == MINUS_EXPR 2843 && (op1 = gimple_assign_rhs1 (def_stmt)) 2844 && TREE_CODE (op1) == SSA_NAME 2845 && SSA_NAME_DEF_STMT (op1) == phi) 2846 code = PLUS_EXPR; 2847 2848 if (code == COND_EXPR) 2849 { 2850 if (check_reduction) 2851 *v_reduc_type = COND_REDUCTION; 2852 } 2853 else if (!commutative_tree_code (code) || !associative_tree_code (code)) 2854 { 2855 if (dump_enabled_p ()) 2856 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2857 "reduction: not commutative/associative: "); 2858 return NULL; 2859 } 2860 2861 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS) 2862 { 2863 if (code != COND_EXPR) 2864 { 2865 if (dump_enabled_p ()) 2866 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2867 "reduction: not binary operation: "); 2868 2869 return NULL; 2870 } 2871 2872 op3 = gimple_assign_rhs1 (def_stmt); 2873 if (COMPARISON_CLASS_P (op3)) 2874 { 2875 op4 = TREE_OPERAND (op3, 1); 2876 op3 = TREE_OPERAND (op3, 0); 2877 } 2878 2879 op1 = gimple_assign_rhs2 (def_stmt); 2880 op2 = gimple_assign_rhs3 (def_stmt); 2881 2882 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) 2883 { 2884 if (dump_enabled_p ()) 2885 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2886 "reduction: uses not ssa_names: "); 2887 2888 return NULL; 2889 } 2890 } 2891 else 2892 { 2893 op1 = gimple_assign_rhs1 (def_stmt); 2894 op2 = gimple_assign_rhs2 (def_stmt); 2895 2896 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) 2897 { 2898 if (dump_enabled_p ()) 2899 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2900 "reduction: uses not ssa_names: "); 2901 2902 return NULL; 2903 } 2904 } 2905 2906 type = TREE_TYPE (gimple_assign_lhs (def_stmt)); 2907 if ((TREE_CODE (op1) == SSA_NAME 2908 && !types_compatible_p (type,TREE_TYPE (op1))) 2909 || (TREE_CODE (op2) == SSA_NAME 2910 && !types_compatible_p (type, TREE_TYPE (op2))) 2911 || (op3 && TREE_CODE (op3) == SSA_NAME 2912 && !types_compatible_p (type, TREE_TYPE (op3))) 2913 || (op4 && TREE_CODE (op4) == SSA_NAME 2914 && !types_compatible_p (type, TREE_TYPE (op4)))) 2915 { 2916 if (dump_enabled_p ()) 2917 { 2918 dump_printf_loc (MSG_NOTE, vect_location, 2919 "reduction: multiple types: operation type: "); 2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, type); 2921 dump_printf (MSG_NOTE, ", operands types: "); 2922 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2923 TREE_TYPE (op1)); 2924 dump_printf (MSG_NOTE, ","); 2925 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2926 TREE_TYPE (op2)); 2927 if (op3) 2928 { 2929 dump_printf (MSG_NOTE, ","); 2930 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2931 TREE_TYPE (op3)); 2932 } 2933 2934 if (op4) 2935 { 2936 dump_printf (MSG_NOTE, ","); 2937 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2938 TREE_TYPE (op4)); 2939 } 2940 dump_printf (MSG_NOTE, "\n"); 2941 } 2942 2943 return NULL; 2944 } 2945 2946 /* Check that it's ok to change the order of the computation. 2947 Generally, when vectorizing a reduction we change the order of the 2948 computation. This may change the behavior of the program in some 2949 cases, so we need to check that this is ok. One exception is when 2950 vectorizing an outer-loop: the inner-loop is executed sequentially, 2951 and therefore vectorizing reductions in the inner-loop during 2952 outer-loop vectorization is safe. */ 2953 2954 if (*v_reduc_type != COND_REDUCTION 2955 && check_reduction) 2956 { 2957 /* CHECKME: check for !flag_finite_math_only too? */ 2958 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math) 2959 { 2960 /* Changing the order of operations changes the semantics. 
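                 To make the hazard concrete (illustrative values only): in
                 IEEE double precision, (1e30 + -1e30) + 1.0 evaluates to 1.0,
                 whereas 1e30 + (-1e30 + 1.0) evaluates to 0.0, because
                 -1e30 + 1.0 rounds back to -1e30.  Vectorizing the reduction
                 reassociates the sum in exactly this way, so it is only
                 allowed when -fassociative-math (or an option implying it,
                 such as -ffast-math) is in effect.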
*/ 2961 if (dump_enabled_p ()) 2962 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2963 "reduction: unsafe fp math optimization: "); 2964 return NULL; 2965 } 2966 else if (INTEGRAL_TYPE_P (type)) 2967 { 2968 if (!operation_no_trapping_overflow (type, code)) 2969 { 2970 /* Changing the order of operations changes the semantics. */ 2971 if (dump_enabled_p ()) 2972 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2973 "reduction: unsafe int math optimization" 2974 " (overflow traps): "); 2975 return NULL; 2976 } 2977 if (need_wrapping_integral_overflow 2978 && !TYPE_OVERFLOW_WRAPS (type) 2979 && operation_can_overflow (code)) 2980 { 2981 /* Changing the order of operations changes the semantics. */ 2982 if (dump_enabled_p ()) 2983 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2984 "reduction: unsafe int math optimization" 2985 " (overflow doesn't wrap): "); 2986 return NULL; 2987 } 2988 } 2989 else if (SAT_FIXED_POINT_TYPE_P (type)) 2990 { 2991 /* Changing the order of operations changes the semantics. */ 2992 if (dump_enabled_p ()) 2993 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 2994 "reduction: unsafe fixed-point math optimization: "); 2995 return NULL; 2996 } 2997 } 2998 2999 /* Reduction is safe. We're dealing with one of the following: 3000 1) integer arithmetic and no trapv 3001 2) floating point arithmetic, and special flags permit this optimization 3002 3) nested cycle (i.e., outer loop vectorization). */ 3003 if (TREE_CODE (op1) == SSA_NAME) 3004 def1 = SSA_NAME_DEF_STMT (op1); 3005 3006 if (TREE_CODE (op2) == SSA_NAME) 3007 def2 = SSA_NAME_DEF_STMT (op2); 3008 3009 if (code != COND_EXPR 3010 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2)))) 3011 { 3012 if (dump_enabled_p ()) 3013 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: "); 3014 return NULL; 3015 } 3016 3017 /* Check that one def is the reduction def, defined by PHI, 3018 the other def is either defined in the loop ("vect_internal_def"), 3019 or it's an induction (defined by a loop-header phi-node). */ 3020 3021 if (def2 && def2 == phi 3022 && (code == COND_EXPR 3023 || !def1 || gimple_nop_p (def1) 3024 || !flow_bb_inside_loop_p (loop, gimple_bb (def1)) 3025 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) 3026 && (is_gimple_assign (def1) 3027 || is_gimple_call (def1) 3028 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 3029 == vect_induction_def 3030 || (gimple_code (def1) == GIMPLE_PHI 3031 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 3032 == vect_internal_def 3033 && !is_loop_header_bb_p (gimple_bb (def1))))))) 3034 { 3035 if (dump_enabled_p ()) 3036 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3037 return def_stmt; 3038 } 3039 3040 if (def1 && def1 == phi 3041 && (code == COND_EXPR 3042 || !def2 || gimple_nop_p (def2) 3043 || !flow_bb_inside_loop_p (loop, gimple_bb (def2)) 3044 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) 3045 && (is_gimple_assign (def2) 3046 || is_gimple_call (def2) 3047 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 3048 == vect_induction_def 3049 || (gimple_code (def2) == GIMPLE_PHI 3050 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 3051 == vect_internal_def 3052 && !is_loop_header_bb_p (gimple_bb (def2))))))) 3053 { 3054 if (check_reduction && orig_code != MINUS_EXPR) 3055 { 3056 /* Check if we can swap operands (just for simplicity - so that 3057 the rest of the code can assume that the reduction variable 3058 is always the last (second) argument). 
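             For the COND_EXPR case handled first below, a sketch with
             hypothetical SSA names: a conditional reduction of the form

               r_4 = a_1 < b_2 ? r_3 : x_5

             where r_3 comes from the reduction phi cannot simply have its
             operands exchanged; instead (ignoring NaN complications for the
             sake of the example) the comparison is inverted and the two
             value operands are swapped, giving

               r_4 = a_1 >= b_2 ? x_5 : r_3

             which again leaves the reduction value in the last position.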
*/ 3059 if (code == COND_EXPR) 3060 { 3061 /* Swap cond_expr by inverting the condition. */ 3062 tree cond_expr = gimple_assign_rhs1 (def_stmt); 3063 enum tree_code invert_code = ERROR_MARK; 3064 enum tree_code cond_code = TREE_CODE (cond_expr); 3065 3066 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) 3067 { 3068 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); 3069 invert_code = invert_tree_comparison (cond_code, honor_nans); 3070 } 3071 if (invert_code != ERROR_MARK) 3072 { 3073 TREE_SET_CODE (cond_expr, invert_code); 3074 swap_ssa_operands (def_stmt, 3075 gimple_assign_rhs2_ptr (def_stmt), 3076 gimple_assign_rhs3_ptr (def_stmt)); 3077 } 3078 else 3079 { 3080 if (dump_enabled_p ()) 3081 report_vect_op (MSG_NOTE, def_stmt, 3082 "detected reduction: cannot swap operands " 3083 "for cond_expr"); 3084 return NULL; 3085 } 3086 } 3087 else 3088 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), 3089 gimple_assign_rhs2_ptr (def_stmt)); 3090 3091 if (dump_enabled_p ()) 3092 report_vect_op (MSG_NOTE, def_stmt, 3093 "detected reduction: need to swap operands: "); 3094 3095 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) 3096 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 3097 } 3098 else 3099 { 3100 if (dump_enabled_p ()) 3101 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3102 } 3103 3104 return def_stmt; 3105 } 3106 3107 /* Try to find SLP reduction chain. */ 3108 if (check_reduction && code != COND_EXPR 3109 && vect_is_slp_reduction (loop_info, phi, def_stmt)) 3110 { 3111 if (dump_enabled_p ()) 3112 report_vect_op (MSG_NOTE, def_stmt, 3113 "reduction: detected reduction chain: "); 3114 3115 return def_stmt; 3116 } 3117 3118 if (dump_enabled_p ()) 3119 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3120 "reduction: unknown pattern: "); 3121 3122 return NULL; 3123 } 3124 3125 /* Wrapper around vect_is_simple_reduction_1, which will modify code 3126 in-place if it enables detection of more reductions. Arguments 3127 as there. */ 3128 3129 gimple * 3130 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi, 3131 bool check_reduction, bool *double_reduc, 3132 bool need_wrapping_integral_overflow) 3133 { 3134 enum vect_reduction_type v_reduc_type; 3135 return vect_is_simple_reduction (loop_info, phi, check_reduction, 3136 double_reduc, 3137 need_wrapping_integral_overflow, 3138 &v_reduc_type); 3139 } 3140 3141 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ 3142 int 3143 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, 3144 int *peel_iters_epilogue, 3145 stmt_vector_for_cost *scalar_cost_vec, 3146 stmt_vector_for_cost *prologue_cost_vec, 3147 stmt_vector_for_cost *epilogue_cost_vec) 3148 { 3149 int retval = 0; 3150 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3151 3152 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 3153 { 3154 *peel_iters_epilogue = vf/2; 3155 if (dump_enabled_p ()) 3156 dump_printf_loc (MSG_NOTE, vect_location, 3157 "cost model: epilogue peel iters set to vf/2 " 3158 "because loop iterations are unknown .\n"); 3159 3160 /* If peeled iterations are known but number of scalar loop 3161 iterations are unknown, count a taken branch per peeled loop. 
*/ 3162 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3163 NULL, 0, vect_prologue); 3164 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3165 NULL, 0, vect_epilogue); 3166 } 3167 else 3168 { 3169 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); 3170 peel_iters_prologue = niters < peel_iters_prologue ? 3171 niters : peel_iters_prologue; 3172 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf; 3173 /* If we need to peel for gaps, but no peeling is required, we have to 3174 peel VF iterations. */ 3175 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue) 3176 *peel_iters_epilogue = vf; 3177 } 3178 3179 stmt_info_for_cost *si; 3180 int j; 3181 if (peel_iters_prologue) 3182 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3183 { 3184 stmt_vec_info stmt_info 3185 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3186 retval += record_stmt_cost (prologue_cost_vec, 3187 si->count * peel_iters_prologue, 3188 si->kind, stmt_info, si->misalign, 3189 vect_prologue); 3190 } 3191 if (*peel_iters_epilogue) 3192 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3193 { 3194 stmt_vec_info stmt_info 3195 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3196 retval += record_stmt_cost (epilogue_cost_vec, 3197 si->count * *peel_iters_epilogue, 3198 si->kind, stmt_info, si->misalign, 3199 vect_epilogue); 3200 } 3201 3202 return retval; 3203 } 3204 3205 /* Function vect_estimate_min_profitable_iters 3206 3207 Return the number of iterations required for the vector version of the 3208 loop to be profitable relative to the cost of the scalar version of the 3209 loop. 3210 3211 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold 3212 of iterations for vectorization. -1 value means loop vectorization 3213 is not profitable. This returned value may be used for dynamic 3214 profitability check. 3215 3216 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used 3217 for static check against estimated number of iterations. */ 3218 3219 static void 3220 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, 3221 int *ret_min_profitable_niters, 3222 int *ret_min_profitable_estimate) 3223 { 3224 int min_profitable_iters; 3225 int min_profitable_estimate; 3226 int peel_iters_prologue; 3227 int peel_iters_epilogue; 3228 unsigned vec_inside_cost = 0; 3229 int vec_outside_cost = 0; 3230 unsigned vec_prologue_cost = 0; 3231 unsigned vec_epilogue_cost = 0; 3232 int scalar_single_iter_cost = 0; 3233 int scalar_outside_cost = 0; 3234 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3235 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 3236 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3237 3238 /* Cost model disabled. */ 3239 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 3240 { 3241 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); 3242 *ret_min_profitable_niters = 0; 3243 *ret_min_profitable_estimate = 0; 3244 return; 3245 } 3246 3247 /* Requires loop versioning tests to handle misalignment. */ 3248 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 3249 { 3250 /* FIXME: Make cost depend on complexity of individual check. */ 3251 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); 3252 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3253 vect_prologue); 3254 dump_printf (MSG_NOTE, 3255 "cost model: Adding cost of checks for loop " 3256 "versioning to treat misalignment.\n"); 3257 } 3258 3259 /* Requires loop versioning with alias checks. 
*/ 3260 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 3261 { 3262 /* FIXME: Make cost depend on complexity of individual check. */ 3263 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); 3264 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3265 vect_prologue); 3266 dump_printf (MSG_NOTE, 3267 "cost model: Adding cost of checks for loop " 3268 "versioning aliasing.\n"); 3269 } 3270 3271 /* Requires loop versioning with niter checks. */ 3272 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) 3273 { 3274 /* FIXME: Make cost depend on complexity of individual check. */ 3275 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, 3276 vect_prologue); 3277 dump_printf (MSG_NOTE, 3278 "cost model: Adding cost of checks for loop " 3279 "versioning niters.\n"); 3280 } 3281 3282 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3283 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, 3284 vect_prologue); 3285 3286 /* Count statements in scalar loop. Using this as scalar cost for a single 3287 iteration for now. 3288 3289 TODO: Add outer loop support. 3290 3291 TODO: Consider assigning different costs to different scalar 3292 statements. */ 3293 3294 scalar_single_iter_cost 3295 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo); 3296 3297 /* Add additional cost for the peeled instructions in prologue and epilogue 3298 loop. 3299 3300 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 3301 at compile-time - we assume it's vf/2 (the worst would be vf-1). 3302 3303 TODO: Build an expression that represents peel_iters for prologue and 3304 epilogue to be used in a run-time test. */ 3305 3306 if (npeel < 0) 3307 { 3308 peel_iters_prologue = vf/2; 3309 dump_printf (MSG_NOTE, "cost model: " 3310 "prologue peel iters set to vf/2.\n"); 3311 3312 /* If peeling for alignment is unknown, loop bound of main loop becomes 3313 unknown. */ 3314 peel_iters_epilogue = vf/2; 3315 dump_printf (MSG_NOTE, "cost model: " 3316 "epilogue peel iters set to vf/2 because " 3317 "peeling for alignment is unknown.\n"); 3318 3319 /* If peeled iterations are unknown, count a taken branch and a not taken 3320 branch per peeled loop. Even if scalar loop iterations are known, 3321 vector iterations are not known since peeled prologue iterations are 3322 not known. Hence guards remain the same. */ 3323 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3324 NULL, 0, vect_prologue); 3325 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3326 NULL, 0, vect_prologue); 3327 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3328 NULL, 0, vect_epilogue); 3329 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3330 NULL, 0, vect_epilogue); 3331 stmt_info_for_cost *si; 3332 int j; 3333 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) 3334 { 3335 struct _stmt_vec_info *stmt_info 3336 = si->stmt ? 
vinfo_for_stmt (si->stmt) : NULL; 3337 (void) add_stmt_cost (target_cost_data, 3338 si->count * peel_iters_prologue, 3339 si->kind, stmt_info, si->misalign, 3340 vect_prologue); 3341 (void) add_stmt_cost (target_cost_data, 3342 si->count * peel_iters_epilogue, 3343 si->kind, stmt_info, si->misalign, 3344 vect_epilogue); 3345 } 3346 } 3347 else 3348 { 3349 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 3350 stmt_info_for_cost *si; 3351 int j; 3352 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3353 3354 prologue_cost_vec.create (2); 3355 epilogue_cost_vec.create (2); 3356 peel_iters_prologue = npeel; 3357 3358 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue, 3359 &peel_iters_epilogue, 3360 &LOOP_VINFO_SCALAR_ITERATION_COST 3361 (loop_vinfo), 3362 &prologue_cost_vec, 3363 &epilogue_cost_vec); 3364 3365 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si) 3366 { 3367 struct _stmt_vec_info *stmt_info 3368 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3369 (void) add_stmt_cost (data, si->count, si->kind, stmt_info, 3370 si->misalign, vect_prologue); 3371 } 3372 3373 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si) 3374 { 3375 struct _stmt_vec_info *stmt_info 3376 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3377 (void) add_stmt_cost (data, si->count, si->kind, stmt_info, 3378 si->misalign, vect_epilogue); 3379 } 3380 3381 prologue_cost_vec.release (); 3382 epilogue_cost_vec.release (); 3383 } 3384 3385 /* FORNOW: The scalar outside cost is incremented in one of the 3386 following ways: 3387 3388 1. The vectorizer checks for alignment and aliasing and generates 3389 a condition that allows dynamic vectorization. A cost model 3390 check is ANDED with the versioning condition. Hence scalar code 3391 path now has the added cost of the versioning check. 3392 3393 if (cost > th & versioning_check) 3394 jmp to vector code 3395 3396 Hence run-time scalar is incremented by not-taken branch cost. 3397 3398 2. The vectorizer then checks if a prologue is required. If the 3399 cost model check was not done before during versioning, it has to 3400 be done before the prologue check. 3401 3402 if (cost <= th) 3403 prologue = scalar_iters 3404 if (prologue == 0) 3405 jmp to vector code 3406 else 3407 execute prologue 3408 if (prologue == num_iters) 3409 go to exit 3410 3411 Hence the run-time scalar cost is incremented by a taken branch, 3412 plus a not-taken branch, plus a taken branch cost. 3413 3414 3. The vectorizer then checks if an epilogue is required. If the 3415 cost model check was not done before during prologue check, it 3416 has to be done with the epilogue check. 3417 3418 if (prologue == 0) 3419 jmp to vector code 3420 else 3421 execute prologue 3422 if (prologue == num_iters) 3423 go to exit 3424 vector code: 3425 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) 3426 jmp to epilogue 3427 3428 Hence the run-time scalar cost should be incremented by 2 taken 3429 branches. 3430 3431 TODO: The back end may reorder the BBS's differently and reverse 3432 conditions/branch directions. Change the estimates below to 3433 something more reasonable. */ 3434 3435 /* If the number of iterations is known and we do not do versioning, we can 3436 decide whether to vectorize at compile time. Hence the scalar version 3437 do not carry cost model guard costs. */ 3438 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 3439 || LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3440 { 3441 /* Cost model check occurs at versioning. 
*/ 3442 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3443 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken); 3444 else 3445 { 3446 /* Cost model check occurs at prologue generation. */ 3447 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 3448 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken) 3449 + vect_get_stmt_cost (cond_branch_not_taken); 3450 /* Cost model check occurs at epilogue generation. */ 3451 else 3452 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken); 3453 } 3454 } 3455 3456 /* Complete the target-specific cost calculations. */ 3457 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, 3458 &vec_inside_cost, &vec_epilogue_cost); 3459 3460 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); 3461 3462 if (dump_enabled_p ()) 3463 { 3464 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); 3465 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", 3466 vec_inside_cost); 3467 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", 3468 vec_prologue_cost); 3469 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", 3470 vec_epilogue_cost); 3471 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n", 3472 scalar_single_iter_cost); 3473 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n", 3474 scalar_outside_cost); 3475 dump_printf (MSG_NOTE, " Vector outside cost: %d\n", 3476 vec_outside_cost); 3477 dump_printf (MSG_NOTE, " prologue iterations: %d\n", 3478 peel_iters_prologue); 3479 dump_printf (MSG_NOTE, " epilogue iterations: %d\n", 3480 peel_iters_epilogue); 3481 } 3482 3483 /* Calculate number of iterations required to make the vector version 3484 profitable, relative to the loop bodies only. The following condition 3485 must hold true: 3486 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC 3487 where 3488 SIC = scalar iteration cost, VIC = vector iteration cost, 3489 VOC = vector outside cost, VF = vectorization factor, 3490 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations 3491 SOC = scalar outside cost for run time cost model check. */ 3492 3493 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost) 3494 { 3495 if (vec_outside_cost <= 0) 3496 min_profitable_iters = 1; 3497 else 3498 { 3499 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf 3500 - vec_inside_cost * peel_iters_prologue 3501 - vec_inside_cost * peel_iters_epilogue) 3502 / ((scalar_single_iter_cost * vf) 3503 - vec_inside_cost); 3504 3505 if ((scalar_single_iter_cost * vf * min_profitable_iters) 3506 <= (((int) vec_inside_cost * min_profitable_iters) 3507 + (((int) vec_outside_cost - scalar_outside_cost) * vf))) 3508 min_profitable_iters++; 3509 } 3510 } 3511 /* vector version will never be profitable. 
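     This branch is taken when one vector iteration is at least as expensive
     as VF scalar iterations, i.e. SIC * VF <= VIC.  With made-up numbers:
     VF = 4, SIC = 2 and VIC = 8, four scalar iterations cost 8, which is no
     more than a single vector iteration, so no trip count can amortize the
     vector outside cost.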
*/ 3512 else 3513 { 3514 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) 3515 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization " 3516 "did not happen for a simd loop"); 3517 3518 if (dump_enabled_p ()) 3519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3520 "cost model: the vector iteration cost = %d " 3521 "divided by the scalar iteration cost = %d " 3522 "is greater or equal to the vectorization factor = %d" 3523 ".\n", 3524 vec_inside_cost, scalar_single_iter_cost, vf); 3525 *ret_min_profitable_niters = -1; 3526 *ret_min_profitable_estimate = -1; 3527 return; 3528 } 3529 3530 dump_printf (MSG_NOTE, 3531 " Calculated minimum iters for profitability: %d\n", 3532 min_profitable_iters); 3533 3534 min_profitable_iters = 3535 min_profitable_iters < vf ? vf : min_profitable_iters; 3536 3537 /* Because the condition we create is: 3538 if (niters <= min_profitable_iters) 3539 then skip the vectorized loop. */ 3540 min_profitable_iters--; 3541 3542 if (dump_enabled_p ()) 3543 dump_printf_loc (MSG_NOTE, vect_location, 3544 " Runtime profitability threshold = %d\n", 3545 min_profitable_iters); 3546 3547 *ret_min_profitable_niters = min_profitable_iters; 3548 3549 /* Calculate number of iterations required to make the vector version 3550 profitable, relative to the loop bodies only. 3551 3552 Non-vectorized variant is SIC * niters and it must win over vector 3553 variant on the expected loop trip count. The following condition must hold true: 3554 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */ 3555 3556 if (vec_outside_cost <= 0) 3557 min_profitable_estimate = 1; 3558 else 3559 { 3560 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf 3561 - vec_inside_cost * peel_iters_prologue 3562 - vec_inside_cost * peel_iters_epilogue) 3563 / ((scalar_single_iter_cost * vf) 3564 - vec_inside_cost); 3565 } 3566 min_profitable_estimate --; 3567 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); 3568 if (dump_enabled_p ()) 3569 dump_printf_loc (MSG_NOTE, vect_location, 3570 " Static estimate profitability threshold = %d\n", 3571 min_profitable_estimate); 3572 3573 *ret_min_profitable_estimate = min_profitable_estimate; 3574 } 3575 3576 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET 3577 vector elements (not bits) for a vector of mode MODE. */ 3578 static void 3579 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset, 3580 unsigned char *sel) 3581 { 3582 unsigned int i, nelt = GET_MODE_NUNITS (mode); 3583 3584 for (i = 0; i < nelt; i++) 3585 sel[i] = (i + offset) & (2*nelt - 1); 3586 } 3587 3588 /* Checks whether the target supports whole-vector shifts for vectors of mode 3589 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ 3590 it supports vec_perm_const with masks for all necessary shift amounts. */ 3591 static bool 3592 have_whole_vector_shift (enum machine_mode mode) 3593 { 3594 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) 3595 return true; 3596 3597 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing) 3598 return false; 3599 3600 unsigned int i, nelt = GET_MODE_NUNITS (mode); 3601 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 3602 3603 for (i = nelt/2; i >= 1; i/=2) 3604 { 3605 calc_vec_perm_mask_for_shift (mode, i, sel); 3606 if (!can_vec_perm_p (mode, false, sel)) 3607 return false; 3608 } 3609 return true; 3610 } 3611 3612 /* Return the reduction operand (with index REDUC_INDEX) of STMT. 
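   For instance (hypothetical statements): for the binary assignment
   s_1 = x_2 + y_3, REDUC_INDEX 0 yields x_2 and REDUC_INDEX 1 yields y_3;
   for a ternary right-hand side such as a COND_EXPR the operand is simply
   gimple_op (stmt, reduc_index + 1).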
*/ 3613 3614 static tree 3615 get_reduction_op (gimple *stmt, int reduc_index) 3616 { 3617 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 3618 { 3619 case GIMPLE_SINGLE_RHS: 3620 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) 3621 == ternary_op); 3622 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index); 3623 case GIMPLE_UNARY_RHS: 3624 return gimple_assign_rhs1 (stmt); 3625 case GIMPLE_BINARY_RHS: 3626 return (reduc_index 3627 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt)); 3628 case GIMPLE_TERNARY_RHS: 3629 return gimple_op (stmt, reduc_index + 1); 3630 default: 3631 gcc_unreachable (); 3632 } 3633 } 3634 3635 /* TODO: Close dependency between vect_model_*_cost and vectorizable_* 3636 functions. Design better to avoid maintenance issues. */ 3637 3638 /* Function vect_model_reduction_cost. 3639 3640 Models cost for a reduction operation, including the vector ops 3641 generated within the strip-mine loop, the initial definition before 3642 the loop, and the epilogue code that must be generated. */ 3643 3644 static bool 3645 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, 3646 int ncopies, int reduc_index) 3647 { 3648 int prologue_cost = 0, epilogue_cost = 0; 3649 enum tree_code code; 3650 optab optab; 3651 tree vectype; 3652 gimple *stmt, *orig_stmt; 3653 tree reduction_op; 3654 machine_mode mode; 3655 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 3656 struct loop *loop = NULL; 3657 void *target_cost_data; 3658 3659 if (loop_vinfo) 3660 { 3661 loop = LOOP_VINFO_LOOP (loop_vinfo); 3662 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3663 } 3664 else 3665 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info)); 3666 3667 /* Condition reductions generate two reductions in the loop. */ 3668 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 3669 ncopies *= 2; 3670 3671 /* Cost of reduction op inside loop. */ 3672 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, 3673 stmt_info, 0, vect_body); 3674 stmt = STMT_VINFO_STMT (stmt_info); 3675 3676 reduction_op = get_reduction_op (stmt, reduc_index); 3677 3678 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); 3679 if (!vectype) 3680 { 3681 if (dump_enabled_p ()) 3682 { 3683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3684 "unsupported data-type "); 3685 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 3686 TREE_TYPE (reduction_op)); 3687 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3688 } 3689 return false; 3690 } 3691 3692 mode = TYPE_MODE (vectype); 3693 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 3694 3695 if (!orig_stmt) 3696 orig_stmt = STMT_VINFO_STMT (stmt_info); 3697 3698 code = gimple_assign_rhs_code (orig_stmt); 3699 3700 /* Add in cost for initial definition. 3701 For cond reduction we have four vectors: initial index, step, initial 3702 result of the data reduction, initial value of the index reduction. */ 3703 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 3704 == COND_REDUCTION ? 4 : 1; 3705 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts, 3706 scalar_to_vec, stmt_info, 0, 3707 vect_prologue); 3708 3709 /* Determine cost of epilogue code. 3710 3711 We have a reduction operator that will reduce the vector in one statement. 3712 Also requires scalar extract. 
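As a rough hypothetical illustration for an eight-element vector (leaving aside the COND_REDUCTION case): with a direct reduc_code the epilogue below is costed as one vector statement plus one vec_to_scalar extract; without it but with whole-vector shifts, as log2(8) * 2 = 6 vector statements plus one extract; with neither, as 8 extracts plus 7 reduction ops.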
*/ 3713 3714 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt)) 3715 { 3716 if (reduc_code != ERROR_MARK) 3717 { 3718 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 3719 { 3720 /* An EQ stmt and an COND_EXPR stmt. */ 3721 epilogue_cost += add_stmt_cost (target_cost_data, 2, 3722 vector_stmt, stmt_info, 0, 3723 vect_epilogue); 3724 /* Reduction of the max index and a reduction of the found 3725 values. */ 3726 epilogue_cost += add_stmt_cost (target_cost_data, 2, 3727 vec_to_scalar, stmt_info, 0, 3728 vect_epilogue); 3729 /* A broadcast of the max value. */ 3730 epilogue_cost += add_stmt_cost (target_cost_data, 1, 3731 scalar_to_vec, stmt_info, 0, 3732 vect_epilogue); 3733 } 3734 else 3735 { 3736 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt, 3737 stmt_info, 0, vect_epilogue); 3738 epilogue_cost += add_stmt_cost (target_cost_data, 1, 3739 vec_to_scalar, stmt_info, 0, 3740 vect_epilogue); 3741 } 3742 } 3743 else 3744 { 3745 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 3746 tree bitsize = 3747 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); 3748 int element_bitsize = tree_to_uhwi (bitsize); 3749 int nelements = vec_size_in_bits / element_bitsize; 3750 3751 optab = optab_for_tree_code (code, vectype, optab_default); 3752 3753 /* We have a whole vector shift available. */ 3754 if (VECTOR_MODE_P (mode) 3755 && optab_handler (optab, mode) != CODE_FOR_nothing 3756 && have_whole_vector_shift (mode)) 3757 { 3758 /* Final reduction via vector shifts and the reduction operator. 3759 Also requires scalar extract. */ 3760 epilogue_cost += add_stmt_cost (target_cost_data, 3761 exact_log2 (nelements) * 2, 3762 vector_stmt, stmt_info, 0, 3763 vect_epilogue); 3764 epilogue_cost += add_stmt_cost (target_cost_data, 1, 3765 vec_to_scalar, stmt_info, 0, 3766 vect_epilogue); 3767 } 3768 else 3769 /* Use extracts and reduction op for final reduction. For N 3770 elements, we have N extracts and N-1 reduction ops. */ 3771 epilogue_cost += add_stmt_cost (target_cost_data, 3772 nelements + nelements - 1, 3773 vector_stmt, stmt_info, 0, 3774 vect_epilogue); 3775 } 3776 } 3777 3778 if (dump_enabled_p ()) 3779 dump_printf (MSG_NOTE, 3780 "vect_model_reduction_cost: inside_cost = %d, " 3781 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost, 3782 prologue_cost, epilogue_cost); 3783 3784 return true; 3785 } 3786 3787 3788 /* Function vect_model_induction_cost. 3789 3790 Models cost for induction operations. */ 3791 3792 static void 3793 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) 3794 { 3795 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 3796 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3797 unsigned inside_cost, prologue_cost; 3798 3799 /* loop cost for vec_loop. */ 3800 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, 3801 stmt_info, 0, vect_body); 3802 3803 /* prologue cost for vec_init and vec_step. */ 3804 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec, 3805 stmt_info, 0, vect_prologue); 3806 3807 if (dump_enabled_p ()) 3808 dump_printf_loc (MSG_NOTE, vect_location, 3809 "vect_model_induction_cost: inside_cost = %d, " 3810 "prologue_cost = %d .\n", inside_cost, prologue_cost); 3811 } 3812 3813 3814 /* Function get_initial_def_for_induction 3815 3816 Input: 3817 STMT - a stmt that performs an induction operation in the loop. 
3818 IV_PHI - the initial value of the induction variable 3819 3820 Output: 3821 Return a vector variable, initialized with the first VF values of 3822 the induction variable. E.g., for an iv with IV_PHI='X' and 3823 evolution S, for a vector of 4 units, we want to return: 3824 [X, X + S, X + 2*S, X + 3*S]. */ 3825 3826 static tree 3827 get_initial_def_for_induction (gimple *iv_phi) 3828 { 3829 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); 3830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 3831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3832 tree vectype; 3833 int nunits; 3834 edge pe = loop_preheader_edge (loop); 3835 struct loop *iv_loop; 3836 basic_block new_bb; 3837 tree new_vec, vec_init, vec_step, t; 3838 tree new_name; 3839 gimple *new_stmt; 3840 gphi *induction_phi; 3841 tree induc_def, vec_def, vec_dest; 3842 tree init_expr, step_expr; 3843 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3844 int i; 3845 int ncopies; 3846 tree expr; 3847 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); 3848 bool nested_in_vect_loop = false; 3849 gimple_seq stmts; 3850 imm_use_iterator imm_iter; 3851 use_operand_p use_p; 3852 gimple *exit_phi; 3853 edge latch_e; 3854 tree loop_arg; 3855 gimple_stmt_iterator si; 3856 basic_block bb = gimple_bb (iv_phi); 3857 tree stepvectype; 3858 tree resvectype; 3859 3860 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ 3861 if (nested_in_vect_loop_p (loop, iv_phi)) 3862 { 3863 nested_in_vect_loop = true; 3864 iv_loop = loop->inner; 3865 } 3866 else 3867 iv_loop = loop; 3868 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father); 3869 3870 latch_e = loop_latch_edge (iv_loop); 3871 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); 3872 3873 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); 3874 gcc_assert (step_expr != NULL_TREE); 3875 3876 pe = loop_preheader_edge (iv_loop); 3877 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi, 3878 loop_preheader_edge (iv_loop)); 3879 3880 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr)); 3881 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi))); 3882 gcc_assert (vectype); 3883 nunits = TYPE_VECTOR_SUBPARTS (vectype); 3884 ncopies = vf / nunits; 3885 3886 gcc_assert (phi_info); 3887 gcc_assert (ncopies >= 1); 3888 3889 /* Convert the step to the desired type. */ 3890 stmts = NULL; 3891 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr); 3892 if (stmts) 3893 { 3894 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 3895 gcc_assert (!new_bb); 3896 } 3897 3898 /* Find the first insertion point in the BB. */ 3899 si = gsi_after_labels (bb); 3900 3901 /* Create the vector that holds the initial_value of the induction. */ 3902 if (nested_in_vect_loop) 3903 { 3904 /* iv_loop is nested in the loop to be vectorized. init_expr had already 3905 been created during vectorization of previous stmts. We obtain it 3906 from the STMT_VINFO_VEC_STMT of the defining stmt. */ 3907 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi); 3908 /* If the initial value is not of proper type, convert it. 
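For instance (hypothetical case), if the vectorized initial value was created with an unsigned element type while VECTYPE uses the corresponding signed type, the VIEW_CONVERT_EXPR built below simply reinterprets the existing vector as a vector of VECTYPE on the preheader edge.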
*/ 3909 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) 3910 { 3911 new_stmt 3912 = gimple_build_assign (vect_get_new_ssa_name (vectype, 3913 vect_simple_var, 3914 "vec_iv_"), 3915 VIEW_CONVERT_EXPR, 3916 build1 (VIEW_CONVERT_EXPR, vectype, 3917 vec_init)); 3918 vec_init = gimple_assign_lhs (new_stmt); 3919 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), 3920 new_stmt); 3921 gcc_assert (!new_bb); 3922 set_vinfo_for_stmt (new_stmt, 3923 new_stmt_vec_info (new_stmt, loop_vinfo)); 3924 } 3925 } 3926 else 3927 { 3928 vec<constructor_elt, va_gc> *v; 3929 3930 /* iv_loop is the loop to be vectorized. Create: 3931 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 3932 stmts = NULL; 3933 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); 3934 3935 vec_alloc (v, nunits); 3936 bool constant_p = is_gimple_min_invariant (new_name); 3937 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name); 3938 for (i = 1; i < nunits; i++) 3939 { 3940 /* Create: new_name_i = new_name + step_expr */ 3941 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), 3942 new_name, step_expr); 3943 if (!is_gimple_min_invariant (new_name)) 3944 constant_p = false; 3945 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name); 3946 } 3947 if (stmts) 3948 { 3949 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 3950 gcc_assert (!new_bb); 3951 } 3952 3953 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ 3954 if (constant_p) 3955 new_vec = build_vector_from_ctor (vectype, v); 3956 else 3957 new_vec = build_constructor (vectype, v); 3958 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL); 3959 } 3960 3961 3962 /* Create the vector that holds the step of the induction. */ 3963 if (nested_in_vect_loop) 3964 /* iv_loop is nested in the loop to be vectorized. Generate: 3965 vec_step = [S, S, S, S] */ 3966 new_name = step_expr; 3967 else 3968 { 3969 /* iv_loop is the loop to be vectorized. Generate: 3970 vec_step = [VF*S, VF*S, VF*S, VF*S] */ 3971 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 3972 { 3973 expr = build_int_cst (integer_type_node, vf); 3974 expr = fold_convert (TREE_TYPE (step_expr), expr); 3975 } 3976 else 3977 expr = build_int_cst (TREE_TYPE (step_expr), vf); 3978 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 3979 expr, step_expr); 3980 if (TREE_CODE (step_expr) == SSA_NAME) 3981 new_name = vect_init_vector (iv_phi, new_name, 3982 TREE_TYPE (step_expr), NULL); 3983 } 3984 3985 t = unshare_expr (new_name); 3986 gcc_assert (CONSTANT_CLASS_P (new_name) 3987 || TREE_CODE (new_name) == SSA_NAME); 3988 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name)); 3989 gcc_assert (stepvectype); 3990 new_vec = build_vector_from_val (stepvectype, t); 3991 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL); 3992 3993 3994 /* Create the following def-use cycle: 3995 loop prolog: 3996 vec_init = ... 3997 vec_step = ... 3998 loop: 3999 vec_iv = PHI <vec_init, vec_loop> 4000 ... 4001 STMT 4002 ... 4003 vec_loop = vec_iv + vec_step; */ 4004 4005 /* Create the induction-phi that defines the induction-operand. 
*/ 4006 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 4007 induction_phi = create_phi_node (vec_dest, iv_loop->header); 4008 set_vinfo_for_stmt (induction_phi, 4009 new_stmt_vec_info (induction_phi, loop_vinfo)); 4010 induc_def = PHI_RESULT (induction_phi); 4011 4012 /* Create the iv update inside the loop */ 4013 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step); 4014 vec_def = make_ssa_name (vec_dest, new_stmt); 4015 gimple_assign_set_lhs (new_stmt, vec_def); 4016 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 4017 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo)); 4018 4019 /* Set the arguments of the phi node: */ 4020 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 4021 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 4022 UNKNOWN_LOCATION); 4023 4024 4025 /* In case that vectorization factor (VF) is bigger than the number 4026 of elements that we can fit in a vectype (nunits), we have to generate 4027 more than one vector stmt - i.e - we need to "unroll" the 4028 vector stmt by a factor VF/nunits. For more details see documentation 4029 in vectorizable_operation. */ 4030 4031 if (ncopies > 1) 4032 { 4033 stmt_vec_info prev_stmt_vinfo; 4034 /* FORNOW. This restriction should be relaxed. */ 4035 gcc_assert (!nested_in_vect_loop); 4036 4037 /* Create the vector that holds the step of the induction. */ 4038 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 4039 { 4040 expr = build_int_cst (integer_type_node, nunits); 4041 expr = fold_convert (TREE_TYPE (step_expr), expr); 4042 } 4043 else 4044 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 4045 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 4046 expr, step_expr); 4047 if (TREE_CODE (step_expr) == SSA_NAME) 4048 new_name = vect_init_vector (iv_phi, new_name, 4049 TREE_TYPE (step_expr), NULL); 4050 t = unshare_expr (new_name); 4051 gcc_assert (CONSTANT_CLASS_P (new_name) 4052 || TREE_CODE (new_name) == SSA_NAME); 4053 new_vec = build_vector_from_val (stepvectype, t); 4054 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL); 4055 4056 vec_def = induc_def; 4057 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); 4058 for (i = 1; i < ncopies; i++) 4059 { 4060 /* vec_i = vec_prev + vec_step */ 4061 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, 4062 vec_def, vec_step); 4063 vec_def = make_ssa_name (vec_dest, new_stmt); 4064 gimple_assign_set_lhs (new_stmt, vec_def); 4065 4066 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 4067 if (!useless_type_conversion_p (resvectype, vectype)) 4068 { 4069 new_stmt 4070 = gimple_build_assign 4071 (vect_get_new_vect_var (resvectype, vect_simple_var, 4072 "vec_iv_"), 4073 VIEW_CONVERT_EXPR, 4074 build1 (VIEW_CONVERT_EXPR, resvectype, 4075 gimple_assign_lhs (new_stmt))); 4076 gimple_assign_set_lhs (new_stmt, 4077 make_ssa_name 4078 (gimple_assign_lhs (new_stmt), new_stmt)); 4079 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 4080 } 4081 set_vinfo_for_stmt (new_stmt, 4082 new_stmt_vec_info (new_stmt, loop_vinfo)); 4083 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; 4084 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); 4085 } 4086 } 4087 4088 if (nested_in_vect_loop) 4089 { 4090 /* Find the loop-closed exit-phi of the induction, and record 4091 the final vector of induction results: */ 4092 exit_phi = NULL; 4093 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 4094 { 4095 gimple *use_stmt = USE_STMT (use_p); 4096 if (is_gimple_debug (use_stmt)) 4097 continue; 4098 4099 
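/* Debug uses were skipped above; the first remaining use of LOOP_ARG in a block outside IV_LOOP is taken to be the loop-closed exit phi (loop-closed SSA form keeps such uses in exit phis).  */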
if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))) 4100 { 4101 exit_phi = use_stmt; 4102 break; 4103 } 4104 } 4105 if (exit_phi) 4106 { 4107 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); 4108 /* FORNOW. Currently not supporting the case that an inner-loop induction 4109 is not used in the outer-loop (i.e. only outside the outer-loop). */ 4110 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 4111 && !STMT_VINFO_LIVE_P (stmt_vinfo)); 4112 4113 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; 4114 if (dump_enabled_p ()) 4115 { 4116 dump_printf_loc (MSG_NOTE, vect_location, 4117 "vector of inductions after inner-loop:"); 4118 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0); 4119 } 4120 } 4121 } 4122 4123 4124 if (dump_enabled_p ()) 4125 { 4126 dump_printf_loc (MSG_NOTE, vect_location, 4127 "transform induction: created def-use cycle: "); 4128 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0); 4129 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 4130 SSA_NAME_DEF_STMT (vec_def), 0); 4131 } 4132 4133 STMT_VINFO_VEC_STMT (phi_info) = induction_phi; 4134 if (!useless_type_conversion_p (resvectype, vectype)) 4135 { 4136 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype, 4137 vect_simple_var, 4138 "vec_iv_"), 4139 VIEW_CONVERT_EXPR, 4140 build1 (VIEW_CONVERT_EXPR, resvectype, 4141 induc_def)); 4142 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt); 4143 gimple_assign_set_lhs (new_stmt, induc_def); 4144 si = gsi_after_labels (bb); 4145 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 4146 set_vinfo_for_stmt (new_stmt, 4147 new_stmt_vec_info (new_stmt, loop_vinfo)); 4148 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt)) 4149 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi)); 4150 } 4151 4152 return induc_def; 4153 } 4154 4155 4156 /* Function get_initial_def_for_reduction 4157 4158 Input: 4159 STMT - a stmt that performs a reduction operation in the loop. 4160 INIT_VAL - the initial value of the reduction variable 4161 4162 Output: 4163 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result 4164 of the reduction (used for adjusting the epilog - see below). 4165 Return a vector variable, initialized according to the operation that STMT 4166 performs. This vector will be used as the initial value of the 4167 vector of partial results. 4168 4169 Option1 (adjust in epilog): Initialize the vector as follows: 4170 add/bit or/xor: [0,0,...,0,0] 4171 mult/bit and: [1,1,...,1,1] 4172 min/max/cond_expr: [init_val,init_val,..,init_val,init_val] 4173 and when necessary (e.g. add/mult case) let the caller know 4174 that it needs to adjust the result by init_val. 4175 4176 Option2: Initialize the vector as follows: 4177 add/bit or/xor: [init_val,0,0,...,0] 4178 mult/bit and: [init_val,1,1,...,1] 4179 min/max/cond_expr: [init_val,init_val,...,init_val] 4180 and no adjustments are needed. 4181 4182 For example, for the following code: 4183 4184 s = init_val; 4185 for (i=0;i<n;i++) 4186 s = s + a[i]; 4187 4188 STMT is 's = s + a[i]', and the reduction variable is 's'. 4189 For a vector of 4 units, we want to return either [0,0,0,init_val], 4190 or [0,0,0,0] and let the caller know that it needs to adjust 4191 the result at the end by 'init_val'. 4192 4193 FORNOW, we are using the 'adjust in epilog' scheme, because this way the 4194 initialization vector is simpler (same element in all entries), if 4195 ADJUSTMENT_DEF is not NULL, and Option2 otherwise. 4196 4197 A cost model should help decide between these two schemes. 
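As a purely illustrative example (hypothetical values), for the sum above with init_val = 10 and a vector of 4 units: Option1 starts the vector of partial results as [0,0,0,0] and asks the caller to add 10 to the final scalar sum in the epilog, whereas Option2 starts it as [0,0,0,10] and needs no adjustment.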
*/ 4198 4199 tree 4200 get_initial_def_for_reduction (gimple *stmt, tree init_val, 4201 tree *adjustment_def) 4202 { 4203 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 4204 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 4205 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 4206 tree scalar_type = TREE_TYPE (init_val); 4207 tree vectype = get_vectype_for_scalar_type (scalar_type); 4208 int nunits; 4209 enum tree_code code = gimple_assign_rhs_code (stmt); 4210 tree def_for_init; 4211 tree init_def; 4212 tree *elts; 4213 int i; 4214 bool nested_in_vect_loop = false; 4215 REAL_VALUE_TYPE real_init_val = dconst0; 4216 int int_init_val = 0; 4217 gimple *def_stmt = NULL; 4218 gimple_seq stmts = NULL; 4219 4220 gcc_assert (vectype); 4221 nunits = TYPE_VECTOR_SUBPARTS (vectype); 4222 4223 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) 4224 || SCALAR_FLOAT_TYPE_P (scalar_type)); 4225 4226 if (nested_in_vect_loop_p (loop, stmt)) 4227 nested_in_vect_loop = true; 4228 else 4229 gcc_assert (loop == (gimple_bb (stmt))->loop_father); 4230 4231 /* In case of double reduction we only create a vector variable to be put 4232 in the reduction phi node. The actual statement creation is done in 4233 vect_create_epilog_for_reduction. */ 4234 if (adjustment_def && nested_in_vect_loop 4235 && TREE_CODE (init_val) == SSA_NAME 4236 && (def_stmt = SSA_NAME_DEF_STMT (init_val)) 4237 && gimple_code (def_stmt) == GIMPLE_PHI 4238 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 4239 && vinfo_for_stmt (def_stmt) 4240 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 4241 == vect_double_reduction_def) 4242 { 4243 *adjustment_def = NULL; 4244 return vect_create_destination_var (init_val, vectype); 4245 } 4246 4247 /* In case of a nested reduction do not use an adjustment def as 4248 that case is not supported by the epilogue generation correctly 4249 if ncopies is not one. */ 4250 if (adjustment_def && nested_in_vect_loop) 4251 { 4252 *adjustment_def = NULL; 4253 return vect_get_vec_def_for_operand (init_val, stmt); 4254 } 4255 4256 switch (code) 4257 { 4258 case WIDEN_SUM_EXPR: 4259 case DOT_PROD_EXPR: 4260 case SAD_EXPR: 4261 case PLUS_EXPR: 4262 case MINUS_EXPR: 4263 case BIT_IOR_EXPR: 4264 case BIT_XOR_EXPR: 4265 case MULT_EXPR: 4266 case BIT_AND_EXPR: 4267 /* ADJUSTMENT_DEF is NULL when called from 4268 vect_create_epilog_for_reduction to vectorize double reduction. */ 4269 if (adjustment_def) 4270 *adjustment_def = init_val; 4271 4272 if (code == MULT_EXPR) 4273 { 4274 real_init_val = dconst1; 4275 int_init_val = 1; 4276 } 4277 4278 if (code == BIT_AND_EXPR) 4279 int_init_val = -1; 4280 4281 if (SCALAR_FLOAT_TYPE_P (scalar_type)) 4282 def_for_init = build_real (scalar_type, real_init_val); 4283 else 4284 def_for_init = build_int_cst (scalar_type, int_init_val); 4285 4286 /* Create a vector of '0' or '1' except the first element. */ 4287 elts = XALLOCAVEC (tree, nunits); 4288 for (i = nunits - 2; i >= 0; --i) 4289 elts[i + 1] = def_for_init; 4290 4291 /* Option1: the first element is '0' or '1' as well. */ 4292 if (adjustment_def) 4293 { 4294 elts[0] = def_for_init; 4295 init_def = build_vector (vectype, elts); 4296 break; 4297 } 4298 4299 /* Option2: the first element is INIT_VAL. 
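This path is taken when the caller did not request an ADJUSTMENT_DEF (e.g. when called from vect_create_epilog_for_reduction for a double reduction), and yields e.g. [init_val,0,...,0] for a PLUS reduction.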
*/ 4300 elts[0] = init_val; 4301 if (TREE_CONSTANT (init_val)) 4302 init_def = build_vector (vectype, elts); 4303 else 4304 { 4305 vec<constructor_elt, va_gc> *v; 4306 vec_alloc (v, nunits); 4307 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val); 4308 for (i = 1; i < nunits; ++i) 4309 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]); 4310 init_def = build_constructor (vectype, v); 4311 } 4312 4313 break; 4314 4315 case MIN_EXPR: 4316 case MAX_EXPR: 4317 case COND_EXPR: 4318 if (adjustment_def) 4319 { 4320 *adjustment_def = NULL_TREE; 4321 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION) 4322 { 4323 init_def = vect_get_vec_def_for_operand (init_val, stmt); 4324 break; 4325 } 4326 } 4327 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); 4328 if (! gimple_seq_empty_p (stmts)) 4329 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 4330 init_def = build_vector_from_val (vectype, init_val); 4331 break; 4332 4333 default: 4334 gcc_unreachable (); 4335 } 4336 4337 return init_def; 4338 } 4339 4340 /* Function vect_create_epilog_for_reduction 4341 4342 Create code at the loop-epilog to finalize the result of a reduction 4343 computation. 4344 4345 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector 4346 reduction statements. 4347 STMT is the scalar reduction stmt that is being vectorized. 4348 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the 4349 number of elements that we can fit in a vectype (nunits). In this case 4350 we have to generate more than one vector stmt - i.e - we need to "unroll" 4351 the vector stmt by a factor VF/nunits. For more details see documentation 4352 in vectorizable_operation. 4353 REDUC_CODE is the tree-code for the epilog reduction. 4354 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction 4355 computation. 4356 REDUC_INDEX is the index of the operand in the right hand side of the 4357 statement that is defined by REDUCTION_PHI. 4358 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. 4359 SLP_NODE is an SLP node containing a group of reduction statements. The 4360 first one in this group is STMT. 4361 INDUCTION_INDEX is the index of the loop for condition reductions. 4362 Otherwise it is undefined. 4363 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case 4364 when the COND_EXPR is never true in the loop. It needs to 4365 be smaller than any value of the IV in the loop. 4366 4367 This function: 4368 1. Creates the reduction def-use cycles: sets the arguments for 4369 REDUCTION_PHIS: 4370 The loop-entry argument is the vectorized initial-value of the reduction. 4371 The loop-latch argument is taken from VECT_DEFS - the vector of partial 4372 sums. 4373 2. "Reduces" each vector of partial results VECT_DEFS into a single result, 4374 by applying the operation specified by REDUC_CODE if available, or by 4375 other means (whole-vector shifts or a scalar loop). 4376 The function also creates a new phi node at the loop exit to preserve 4377 loop-closed form, as illustrated below. 
4378 4379 The flow at the entry to this function: 4380 4381 loop: 4382 vec_def = phi <null, null> # REDUCTION_PHI 4383 VECT_DEF = vector_stmt # vectorized form of STMT 4384 s_loop = scalar_stmt # (scalar) STMT 4385 loop_exit: 4386 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4387 use <s_out0> 4388 use <s_out0> 4389 4390 The above is transformed by this function into: 4391 4392 loop: 4393 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4394 VECT_DEF = vector_stmt # vectorized form of STMT 4395 s_loop = scalar_stmt # (scalar) STMT 4396 loop_exit: 4397 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4398 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4399 v_out2 = reduce <v_out1> 4400 s_out3 = extract_field <v_out2, 0> 4401 s_out4 = adjust_result <s_out3> 4402 use <s_out4> 4403 use <s_out4> 4404 */ 4405 4406 static void 4407 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, 4408 int ncopies, enum tree_code reduc_code, 4409 vec<gimple *> reduction_phis, 4410 int reduc_index, bool double_reduc, 4411 slp_tree slp_node, tree induction_index, 4412 tree induc_val) 4413 { 4414 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4415 stmt_vec_info prev_phi_info; 4416 tree vectype; 4417 machine_mode mode; 4418 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4419 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; 4420 basic_block exit_bb; 4421 tree scalar_dest; 4422 tree scalar_type; 4423 gimple *new_phi = NULL, *phi; 4424 gimple_stmt_iterator exit_gsi; 4425 tree vec_dest; 4426 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; 4427 gimple *epilog_stmt = NULL; 4428 enum tree_code code = gimple_assign_rhs_code (stmt); 4429 gimple *exit_phi; 4430 tree bitsize; 4431 tree adjustment_def = NULL; 4432 tree vec_initial_def = NULL; 4433 tree reduction_op, expr, def, initial_def = NULL; 4434 tree orig_name, scalar_result; 4435 imm_use_iterator imm_iter, phi_imm_iter; 4436 use_operand_p use_p, phi_use_p; 4437 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL; 4438 bool nested_in_vect_loop = false; 4439 auto_vec<gimple *> new_phis; 4440 auto_vec<gimple *> inner_phis; 4441 enum vect_def_type dt = vect_unknown_def_type; 4442 int j, i; 4443 auto_vec<tree> scalar_results; 4444 unsigned int group_size = 1, k, ratio; 4445 auto_vec<tree> vec_initial_defs; 4446 auto_vec<gimple *> phis; 4447 bool slp_reduc = false; 4448 tree new_phi_result; 4449 gimple *inner_phi = NULL; 4450 4451 if (slp_node) 4452 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 4453 4454 if (nested_in_vect_loop_p (loop, stmt)) 4455 { 4456 outer_loop = loop; 4457 loop = loop->inner; 4458 nested_in_vect_loop = true; 4459 gcc_assert (!slp_node); 4460 } 4461 4462 reduction_op = get_reduction_op (stmt, reduc_index); 4463 4464 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); 4465 gcc_assert (vectype); 4466 mode = TYPE_MODE (vectype); 4467 4468 /* 1. Create the reduction def-use cycle: 4469 Set the arguments of REDUCTION_PHIS, i.e., transform 4470 4471 loop: 4472 vec_def = phi <null, null> # REDUCTION_PHI 4473 VECT_DEF = vector_stmt # vectorized form of STMT 4474 ... 4475 4476 into: 4477 4478 loop: 4479 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4480 VECT_DEF = vector_stmt # vectorized form of STMT 4481 ... 4482 4483 (in case of SLP, do it for all the phis). */ 4484 4485 /* Get the loop-entry arguments. 
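For SLP all of them are obtained at once through vect_get_vec_defs; otherwise there is a single initial value, read off the preheader argument of the scalar reduction phi and turned into a vector by get_initial_def_for_reduction.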
*/ 4486 enum vect_def_type initial_def_dt = vect_unknown_def_type; 4487 if (slp_node) 4488 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs, 4489 NULL, slp_node, reduc_index); 4490 else 4491 { 4492 /* Get at the scalar def before the loop, that defines the initial value 4493 of the reduction variable. */ 4494 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op); 4495 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt, 4496 loop_preheader_edge (loop)); 4497 /* Optimize: if initial_def is for REDUC_MAX smaller than the base 4498 and we can't use zero for induc_val, use initial_def. Similarly 4499 for REDUC_MIN and initial_def larger than the base. */ 4500 if (TREE_CODE (initial_def) == INTEGER_CST 4501 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4502 == INTEGER_INDUC_COND_REDUCTION) 4503 && !integer_zerop (induc_val) 4504 && tree_int_cst_lt (initial_def, induc_val)) 4505 induc_val = initial_def; 4506 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt); 4507 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def, 4508 &adjustment_def); 4509 vec_initial_defs.create (1); 4510 vec_initial_defs.quick_push (vec_initial_def); 4511 } 4512 4513 /* Set phi nodes arguments. */ 4514 FOR_EACH_VEC_ELT (reduction_phis, i, phi) 4515 { 4516 tree vec_init_def, def; 4517 gimple_seq stmts; 4518 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts, 4519 true, NULL_TREE); 4520 if (stmts) 4521 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 4522 4523 def = vect_defs[i]; 4524 for (j = 0; j < ncopies; j++) 4525 { 4526 if (j != 0) 4527 { 4528 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); 4529 if (nested_in_vect_loop) 4530 vec_init_def 4531 = vect_get_vec_def_for_stmt_copy (initial_def_dt, 4532 vec_init_def); 4533 } 4534 4535 /* Set the loop-entry arg of the reduction-phi. */ 4536 4537 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4538 == INTEGER_INDUC_COND_REDUCTION) 4539 { 4540 /* Initialise the reduction phi to zero. This prevents initial 4541 values of non-zero interferring with the reduction op. */ 4542 gcc_assert (ncopies == 1); 4543 gcc_assert (i == 0); 4544 4545 tree vec_init_def_type = TREE_TYPE (vec_init_def); 4546 tree induc_val_vec 4547 = build_vector_from_val (vec_init_def_type, induc_val); 4548 4549 add_phi_arg (as_a <gphi *> (phi), induc_val_vec, 4550 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4551 } 4552 else 4553 add_phi_arg (as_a <gphi *> (phi), vec_init_def, 4554 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4555 4556 /* Set the loop-latch arg for the reduction-phi. */ 4557 if (j > 0) 4558 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def); 4559 4560 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop), 4561 UNKNOWN_LOCATION); 4562 4563 if (dump_enabled_p ()) 4564 { 4565 dump_printf_loc (MSG_NOTE, vect_location, 4566 "transform reduction: created def-use cycle: "); 4567 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 4568 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0); 4569 } 4570 } 4571 } 4572 4573 /* 2. Create epilog code. 4574 The reduction epilog code operates across the elements of the vector 4575 of partial results computed by the vectorized loop. 4576 The reduction epilog code consists of: 4577 4578 step 1: compute the scalar result in a vector (v_out2) 4579 step 2: extract the scalar result (s_out3) from the vector (v_out2) 4580 step 3: adjust the scalar result (s_out3) if needed. 
4581 4582 Step 1 can be accomplished using one the following three schemes: 4583 (scheme 1) using reduc_code, if available. 4584 (scheme 2) using whole-vector shifts, if available. 4585 (scheme 3) using a scalar loop. In this case steps 1+2 above are 4586 combined. 4587 4588 The overall epilog code looks like this: 4589 4590 s_out0 = phi <s_loop> # original EXIT_PHI 4591 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4592 v_out2 = reduce <v_out1> # step 1 4593 s_out3 = extract_field <v_out2, 0> # step 2 4594 s_out4 = adjust_result <s_out3> # step 3 4595 4596 (step 3 is optional, and steps 1 and 2 may be combined). 4597 Lastly, the uses of s_out0 are replaced by s_out4. */ 4598 4599 4600 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: 4601 v_out1 = phi <VECT_DEF> 4602 Store them in NEW_PHIS. */ 4603 4604 exit_bb = single_exit (loop)->dest; 4605 prev_phi_info = NULL; 4606 new_phis.create (vect_defs.length ()); 4607 FOR_EACH_VEC_ELT (vect_defs, i, def) 4608 { 4609 for (j = 0; j < ncopies; j++) 4610 { 4611 tree new_def = copy_ssa_name (def); 4612 phi = create_phi_node (new_def, exit_bb); 4613 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo)); 4614 if (j == 0) 4615 new_phis.quick_push (phi); 4616 else 4617 { 4618 def = vect_get_vec_def_for_stmt_copy (dt, def); 4619 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; 4620 } 4621 4622 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 4623 prev_phi_info = vinfo_for_stmt (phi); 4624 } 4625 } 4626 4627 /* The epilogue is created for the outer-loop, i.e., for the loop being 4628 vectorized. Create exit phis for the outer loop. */ 4629 if (double_reduc) 4630 { 4631 loop = outer_loop; 4632 exit_bb = single_exit (loop)->dest; 4633 inner_phis.create (vect_defs.length ()); 4634 FOR_EACH_VEC_ELT (new_phis, i, phi) 4635 { 4636 tree new_result = copy_ssa_name (PHI_RESULT (phi)); 4637 gphi *outer_phi = create_phi_node (new_result, exit_bb); 4638 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 4639 PHI_RESULT (phi)); 4640 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi, 4641 loop_vinfo)); 4642 inner_phis.quick_push (phi); 4643 new_phis[i] = outer_phi; 4644 prev_phi_info = vinfo_for_stmt (outer_phi); 4645 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi))) 4646 { 4647 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); 4648 new_result = copy_ssa_name (PHI_RESULT (phi)); 4649 outer_phi = create_phi_node (new_result, exit_bb); 4650 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 4651 PHI_RESULT (phi)); 4652 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi, 4653 loop_vinfo)); 4654 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi; 4655 prev_phi_info = vinfo_for_stmt (outer_phi); 4656 } 4657 } 4658 } 4659 4660 exit_gsi = gsi_after_labels (exit_bb); 4661 4662 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 4663 (i.e. when reduc_code is not available) and in the final adjustment 4664 code (if needed). Also get the original scalar reduction variable as 4665 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it 4666 represents a reduction pattern), the tree-code and scalar-def are 4667 taken from the original stmt that the pattern-stmt (STMT) replaces. 4668 Otherwise (it is a regular reduction) - the tree-code and scalar-def 4669 are taken from STMT. 
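For instance, for a widening-sum reduction pattern the vector code is generated from the pattern stmt, but the PLUS_EXPR tree-code and the scalar destination used by the epilogue are taken from the original stmt that the pattern replaced.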
*/ 4670 4671 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 4672 if (!orig_stmt) 4673 { 4674 /* Regular reduction */ 4675 orig_stmt = stmt; 4676 } 4677 else 4678 { 4679 /* Reduction pattern */ 4680 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); 4681 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); 4682 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); 4683 } 4684 4685 code = gimple_assign_rhs_code (orig_stmt); 4686 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, 4687 partial results are added and not subtracted. */ 4688 if (code == MINUS_EXPR) 4689 code = PLUS_EXPR; 4690 4691 scalar_dest = gimple_assign_lhs (orig_stmt); 4692 scalar_type = TREE_TYPE (scalar_dest); 4693 scalar_results.create (group_size); 4694 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 4695 bitsize = TYPE_SIZE (scalar_type); 4696 4697 /* In case this is a reduction in an inner-loop while vectorizing an outer 4698 loop - we don't need to extract a single scalar result at the end of the 4699 inner-loop (unless it is double reduction, i.e., the use of reduction is 4700 outside the outer-loop). The final vector of partial results will be used 4701 in the vectorized outer-loop, or reduced to a scalar result at the end of 4702 the outer-loop. */ 4703 if (nested_in_vect_loop && !double_reduc) 4704 goto vect_finalize_reduction; 4705 4706 /* SLP reduction without reduction chain, e.g., 4707 # a1 = phi <a2, a0> 4708 # b1 = phi <b2, b0> 4709 a2 = operation (a1) 4710 b2 = operation (b1) */ 4711 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))); 4712 4713 /* In case of reduction chain, e.g., 4714 # a1 = phi <a3, a0> 4715 a2 = operation (a1) 4716 a3 = operation (a2), 4717 4718 we may end up with more than one vector result. Here we reduce them to 4719 one vector. */ 4720 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 4721 { 4722 tree first_vect = PHI_RESULT (new_phis[0]); 4723 tree tmp; 4724 gassign *new_vec_stmt = NULL; 4725 4726 vec_dest = vect_create_destination_var (scalar_dest, vectype); 4727 for (k = 1; k < new_phis.length (); k++) 4728 { 4729 gimple *next_phi = new_phis[k]; 4730 tree second_vect = PHI_RESULT (next_phi); 4731 4732 tmp = build2 (code, vectype, first_vect, second_vect); 4733 new_vec_stmt = gimple_build_assign (vec_dest, tmp); 4734 first_vect = make_ssa_name (vec_dest, new_vec_stmt); 4735 gimple_assign_set_lhs (new_vec_stmt, first_vect); 4736 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 4737 } 4738 4739 new_phi_result = first_vect; 4740 if (new_vec_stmt) 4741 { 4742 new_phis.truncate (0); 4743 new_phis.safe_push (new_vec_stmt); 4744 } 4745 } 4746 else 4747 new_phi_result = PHI_RESULT (new_phis[0]); 4748 4749 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 4750 { 4751 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing 4752 various data values where the condition matched and another vector 4753 (INDUCTION_INDEX) containing all the indexes of those matches. We 4754 need to extract the last matching index (which will be the index with 4755 highest value) and use this to index into the data vector. 4756 For the case where there were no matches, the data vector will contain 4757 all default values and the index vector will be all zeros. */ 4758 4759 /* Get various versions of the type of the vector of indexes. 
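Namely its scalar element type, and a same-sized boolean/mask vector type used as the result type of the EQ comparison below.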
*/ 4760 tree index_vec_type = TREE_TYPE (induction_index); 4761 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); 4762 tree index_scalar_type = TREE_TYPE (index_vec_type); 4763 tree index_vec_cmp_type = build_same_sized_truth_vector_type 4764 (index_vec_type); 4765 4766 /* Get an unsigned integer version of the type of the data vector. */ 4767 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type)); 4768 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); 4769 tree vectype_unsigned = build_vector_type 4770 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype)); 4771 4772 /* First we need to create a vector (ZERO_VEC) of zeros and another 4773 vector (MAX_INDEX_VEC) filled with the last matching index, which we 4774 can create using a MAX reduction and then expanding. 4775 In the case where the loop never made any matches, the max index will 4776 be zero. */ 4777 4778 /* Vector of {0, 0, 0,...}. */ 4779 tree zero_vec = make_ssa_name (vectype); 4780 tree zero_vec_rhs = build_zero_cst (vectype); 4781 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs); 4782 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT); 4783 4784 /* Find maximum value from the vector of found indexes. */ 4785 tree max_index = make_ssa_name (index_scalar_type); 4786 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR, 4787 induction_index); 4788 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); 4789 4790 /* Vector of {max_index, max_index, max_index,...}. */ 4791 tree max_index_vec = make_ssa_name (index_vec_type); 4792 tree max_index_vec_rhs = build_vector_from_val (index_vec_type, 4793 max_index); 4794 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, 4795 max_index_vec_rhs); 4796 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); 4797 4798 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes 4799 with the vector (INDUCTION_INDEX) of found indexes, choosing values 4800 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) 4801 otherwise. Only one value should match, resulting in a vector 4802 (VEC_COND) with one data value and the rest zeros. 4803 In the case where the loop never made any matches, every index will 4804 match, resulting in a vector with all data values (which will all be 4805 the default value). */ 4806 4807 /* Compare the max index vector to the vector of found indexes to find 4808 the position of the max value. */ 4809 tree vec_compare = make_ssa_name (index_vec_cmp_type); 4810 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, 4811 induction_index, 4812 max_index_vec); 4813 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); 4814 4815 /* Use the compare to choose either values from the data vector or 4816 zero. */ 4817 tree vec_cond = make_ssa_name (vectype); 4818 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, 4819 vec_compare, new_phi_result, 4820 zero_vec); 4821 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); 4822 4823 /* Finally we need to extract the data value from the vector (VEC_COND) 4824 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR 4825 reduction, but because this doesn't exist, we can use a MAX reduction 4826 instead. The data value might be signed or a float so we need to cast 4827 it first. 4828 In the case where the loop never made any matches, the data values are 4829 all identical, and so will reduce down correctly. 
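As an illustration with made-up values, for a four-element vector with data values [5,7,5,5] (5 being the default) and found indexes [0,3,0,0]: the max index is 3, the EQ/VEC_COND pair zeroes every other lane giving [0,7,0,0], and the unsigned MAX reduction then yields 7.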
*/ 4830 4831 /* Make the matched data values unsigned. */ 4832 tree vec_cond_cast = make_ssa_name (vectype_unsigned); 4833 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned, 4834 vec_cond); 4835 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast, 4836 VIEW_CONVERT_EXPR, 4837 vec_cond_cast_rhs); 4838 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT); 4839 4840 /* Reduce down to a scalar value. */ 4841 tree data_reduc = make_ssa_name (scalar_type_unsigned); 4842 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned, 4843 optab_default); 4844 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned)) 4845 != CODE_FOR_nothing); 4846 gimple *data_reduc_stmt = gimple_build_assign (data_reduc, 4847 REDUC_MAX_EXPR, 4848 vec_cond_cast); 4849 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); 4850 4851 /* Convert the reduced value back to the result type and set as the 4852 result. */ 4853 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type, 4854 data_reduc); 4855 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast); 4856 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 4857 gimple_assign_set_lhs (epilog_stmt, new_temp); 4858 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4859 scalar_results.safe_push (new_temp); 4860 } 4861 4862 /* 2.3 Create the reduction code, using one of the three schemes described 4863 above. In SLP we simply need to extract all the elements from the 4864 vector (without reducing them), so we use scalar shifts. */ 4865 else if (reduc_code != ERROR_MARK && !slp_reduc) 4866 { 4867 tree tmp; 4868 tree vec_elem_type; 4869 4870 /*** Case 1: Create: 4871 v_out2 = reduc_expr <v_out1> */ 4872 4873 if (dump_enabled_p ()) 4874 dump_printf_loc (MSG_NOTE, vect_location, 4875 "Reduce using direct vector reduction.\n"); 4876 4877 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); 4878 if (!useless_type_conversion_p (scalar_type, vec_elem_type)) 4879 { 4880 tree tmp_dest = 4881 vect_create_destination_var (scalar_dest, vec_elem_type); 4882 tmp = build1 (reduc_code, vec_elem_type, new_phi_result); 4883 epilog_stmt = gimple_build_assign (tmp_dest, tmp); 4884 new_temp = make_ssa_name (tmp_dest, epilog_stmt); 4885 gimple_assign_set_lhs (epilog_stmt, new_temp); 4886 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4887 4888 tmp = build1 (NOP_EXPR, scalar_type, new_temp); 4889 } 4890 else 4891 tmp = build1 (reduc_code, scalar_type, new_phi_result); 4892 4893 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp); 4894 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 4895 gimple_assign_set_lhs (epilog_stmt, new_temp); 4896 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4897 4898 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4899 == INTEGER_INDUC_COND_REDUCTION) 4900 && !operand_equal_p (initial_def, induc_val, 0)) 4901 { 4902 /* Earlier we set the initial value to be a vector if induc_val 4903 values. Check the result and if it is induc_val then replace 4904 with the original initial value, unless induc_val is 4905 the same as initial_def already. 
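As a hypothetical illustration: for a conditional reduction of the loop index, e.g. last = i whenever a[i] < value, with scalar initial value 5 and induc_val 0 (below any index the loop produces), a run in which the condition never fires reduces to induc_val, and the COND_EXPR emitted below restores the original 5.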
*/ 4906 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 4907 induc_val); 4908 4909 tmp = make_ssa_name (new_scalar_dest); 4910 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 4911 initial_def, new_temp); 4912 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4913 new_temp = tmp; 4914 } 4915 4916 scalar_results.safe_push (new_temp); 4917 } 4918 else 4919 { 4920 bool reduce_with_shift = have_whole_vector_shift (mode); 4921 int element_bitsize = tree_to_uhwi (bitsize); 4922 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 4923 tree vec_temp; 4924 4925 /* Regardless of whether we have a whole vector shift, if we're 4926 emulating the operation via tree-vect-generic, we don't want 4927 to use it. Only the first round of the reduction is likely 4928 to still be profitable via emulation. */ 4929 /* ??? It might be better to emit a reduction tree code here, so that 4930 tree-vect-generic can expand the first round via bit tricks. */ 4931 if (!VECTOR_MODE_P (mode)) 4932 reduce_with_shift = false; 4933 else 4934 { 4935 optab optab = optab_for_tree_code (code, vectype, optab_default); 4936 if (optab_handler (optab, mode) == CODE_FOR_nothing) 4937 reduce_with_shift = false; 4938 } 4939 4940 if (reduce_with_shift && !slp_reduc) 4941 { 4942 int nelements = vec_size_in_bits / element_bitsize; 4943 unsigned char *sel = XALLOCAVEC (unsigned char, nelements); 4944 4945 int elt_offset; 4946 4947 tree zero_vec = build_zero_cst (vectype); 4948 /*** Case 2: Create: 4949 for (offset = nelements/2; offset >= 1; offset/=2) 4950 { 4951 Create: va' = vec_shift <va, offset> 4952 Create: va = vop <va, va'> 4953 } */ 4954 4955 tree rhs; 4956 4957 if (dump_enabled_p ()) 4958 dump_printf_loc (MSG_NOTE, vect_location, 4959 "Reduce using vector shifts\n"); 4960 4961 vec_dest = vect_create_destination_var (scalar_dest, vectype); 4962 new_temp = new_phi_result; 4963 for (elt_offset = nelements / 2; 4964 elt_offset >= 1; 4965 elt_offset /= 2) 4966 { 4967 calc_vec_perm_mask_for_shift (mode, elt_offset, sel); 4968 tree mask = vect_gen_perm_mask_any (vectype, sel); 4969 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, 4970 new_temp, zero_vec, mask); 4971 new_name = make_ssa_name (vec_dest, epilog_stmt); 4972 gimple_assign_set_lhs (epilog_stmt, new_name); 4973 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4974 4975 epilog_stmt = gimple_build_assign (vec_dest, code, new_name, 4976 new_temp); 4977 new_temp = make_ssa_name (vec_dest, epilog_stmt); 4978 gimple_assign_set_lhs (epilog_stmt, new_temp); 4979 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4980 } 4981 4982 /* 2.4 Extract the final scalar result. 
Create: 4983 s_out3 = extract_field <v_out2, bitpos> */ 4984 4985 if (dump_enabled_p ()) 4986 dump_printf_loc (MSG_NOTE, vect_location, 4987 "extract scalar result\n"); 4988 4989 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, 4990 bitsize, bitsize_zero_node); 4991 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 4992 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 4993 gimple_assign_set_lhs (epilog_stmt, new_temp); 4994 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4995 scalar_results.safe_push (new_temp); 4996 } 4997 else 4998 { 4999 /*** Case 3: Create: 5000 s = extract_field <v_out2, 0> 5001 for (offset = element_size; 5002 offset < vector_size; 5003 offset += element_size;) 5004 { 5005 Create: s' = extract_field <v_out2, offset> 5006 Create: s = op <s, s'> // For non SLP cases 5007 } */ 5008 5009 if (dump_enabled_p ()) 5010 dump_printf_loc (MSG_NOTE, vect_location, 5011 "Reduce using scalar code.\n"); 5012 5013 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 5014 FOR_EACH_VEC_ELT (new_phis, i, new_phi) 5015 { 5016 int bit_offset; 5017 if (gimple_code (new_phi) == GIMPLE_PHI) 5018 vec_temp = PHI_RESULT (new_phi); 5019 else 5020 vec_temp = gimple_assign_lhs (new_phi); 5021 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 5022 bitsize_zero_node); 5023 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5024 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5025 gimple_assign_set_lhs (epilog_stmt, new_temp); 5026 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5027 5028 /* In SLP we don't need to apply reduction operation, so we just 5029 collect s' values in SCALAR_RESULTS. */ 5030 if (slp_reduc) 5031 scalar_results.safe_push (new_temp); 5032 5033 for (bit_offset = element_bitsize; 5034 bit_offset < vec_size_in_bits; 5035 bit_offset += element_bitsize) 5036 { 5037 tree bitpos = bitsize_int (bit_offset); 5038 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, 5039 bitsize, bitpos); 5040 5041 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5042 new_name = make_ssa_name (new_scalar_dest, epilog_stmt); 5043 gimple_assign_set_lhs (epilog_stmt, new_name); 5044 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5045 5046 if (slp_reduc) 5047 { 5048 /* In SLP we don't need to apply reduction operation, so 5049 we just collect s' values in SCALAR_RESULTS. */ 5050 new_temp = new_name; 5051 scalar_results.safe_push (new_name); 5052 } 5053 else 5054 { 5055 epilog_stmt = gimple_build_assign (new_scalar_dest, code, 5056 new_name, new_temp); 5057 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5058 gimple_assign_set_lhs (epilog_stmt, new_temp); 5059 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5060 } 5061 } 5062 } 5063 5064 /* The only case where we need to reduce scalar results in SLP, is 5065 unrolling. If the size of SCALAR_RESULTS is greater than 5066 GROUP_SIZE, we reduce them combining elements modulo 5067 GROUP_SIZE. */ 5068 if (slp_reduc) 5069 { 5070 tree res, first_res, new_res; 5071 gimple *new_stmt; 5072 5073 /* Reduce multiple scalar results in case of SLP unrolling. 
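For example (hypothetically), with GROUP_SIZE 2 and four scalar results s0..s3, the loop below combines s0 with s2 and s1 with s3, leaving one result per group member.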
*/ 5074 for (j = group_size; scalar_results.iterate (j, &res); 5075 j++) 5076 { 5077 first_res = scalar_results[j % group_size]; 5078 new_stmt = gimple_build_assign (new_scalar_dest, code, 5079 first_res, res); 5080 new_res = make_ssa_name (new_scalar_dest, new_stmt); 5081 gimple_assign_set_lhs (new_stmt, new_res); 5082 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT); 5083 scalar_results[j % group_size] = new_res; 5084 } 5085 } 5086 else 5087 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ 5088 scalar_results.safe_push (new_temp); 5089 } 5090 } 5091 5092 vect_finalize_reduction: 5093 5094 if (double_reduc) 5095 loop = loop->inner; 5096 5097 /* 2.5 Adjust the final result by the initial value of the reduction 5098 variable. (When such adjustment is not needed, then 5099 'adjustment_def' is zero). For example, if code is PLUS we create: 5100 new_temp = loop_exit_def + adjustment_def */ 5101 5102 if (adjustment_def) 5103 { 5104 gcc_assert (!slp_reduc); 5105 if (nested_in_vect_loop) 5106 { 5107 new_phi = new_phis[0]; 5108 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); 5109 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); 5110 new_dest = vect_create_destination_var (scalar_dest, vectype); 5111 } 5112 else 5113 { 5114 new_temp = scalar_results[0]; 5115 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 5116 expr = build2 (code, scalar_type, new_temp, adjustment_def); 5117 new_dest = vect_create_destination_var (scalar_dest, scalar_type); 5118 } 5119 5120 epilog_stmt = gimple_build_assign (new_dest, expr); 5121 new_temp = make_ssa_name (new_dest, epilog_stmt); 5122 gimple_assign_set_lhs (epilog_stmt, new_temp); 5123 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5124 if (nested_in_vect_loop) 5125 { 5126 set_vinfo_for_stmt (epilog_stmt, 5127 new_stmt_vec_info (epilog_stmt, loop_vinfo)); 5128 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = 5129 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); 5130 5131 if (!double_reduc) 5132 scalar_results.quick_push (new_temp); 5133 else 5134 scalar_results[0] = new_temp; 5135 } 5136 else 5137 scalar_results[0] = new_temp; 5138 5139 new_phis[0] = epilog_stmt; 5140 } 5141 5142 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 5143 phis with new adjusted scalar results, i.e., replace use <s_out0> 5144 with use <s_out4>. 5145 5146 Transform: 5147 loop_exit: 5148 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5149 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5150 v_out2 = reduce <v_out1> 5151 s_out3 = extract_field <v_out2, 0> 5152 s_out4 = adjust_result <s_out3> 5153 use <s_out0> 5154 use <s_out0> 5155 5156 into: 5157 5158 loop_exit: 5159 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5160 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5161 v_out2 = reduce <v_out1> 5162 s_out3 = extract_field <v_out2, 0> 5163 s_out4 = adjust_result <s_out3> 5164 use <s_out4> 5165 use <s_out4> */ 5166 5167 5168 /* In SLP reduction chain we reduce vector results into one vector if 5169 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of 5170 the last stmt in the reduction chain, since we are looking for the loop 5171 exit phi node. */ 5172 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 5173 { 5174 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 5175 /* Handle reduction patterns. 
*/ 5176 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt))) 5177 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)); 5178 5179 scalar_dest = gimple_assign_lhs (dest_stmt); 5180 group_size = 1; 5181 } 5182 5183 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in 5184 case that GROUP_SIZE is greater than vectorization factor). Therefore, we 5185 need to match SCALAR_RESULTS with corresponding statements. The first 5186 (GROUP_SIZE / number of new vector stmts) scalar results correspond to 5187 the first vector stmt, etc. 5188 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ 5189 if (group_size > new_phis.length ()) 5190 { 5191 ratio = group_size / new_phis.length (); 5192 gcc_assert (!(group_size % new_phis.length ())); 5193 } 5194 else 5195 ratio = 1; 5196 5197 for (k = 0; k < group_size; k++) 5198 { 5199 if (k % ratio == 0) 5200 { 5201 epilog_stmt = new_phis[k / ratio]; 5202 reduction_phi = reduction_phis[k / ratio]; 5203 if (double_reduc) 5204 inner_phi = inner_phis[k / ratio]; 5205 } 5206 5207 if (slp_reduc) 5208 { 5209 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k]; 5210 5211 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt)); 5212 /* SLP statements can't participate in patterns. */ 5213 gcc_assert (!orig_stmt); 5214 scalar_dest = gimple_assign_lhs (current_stmt); 5215 } 5216 5217 phis.create (3); 5218 /* Find the loop-closed-use at the loop exit of the original scalar 5219 result. (The reduction result is expected to have two immediate uses - 5220 one at the latch block, and one at the loop exit). */ 5221 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 5222 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))) 5223 && !is_gimple_debug (USE_STMT (use_p))) 5224 phis.safe_push (USE_STMT (use_p)); 5225 5226 /* While we expect to have found an exit_phi because of loop-closed-ssa 5227 form we can end up without one if the scalar cycle is dead. */ 5228 5229 FOR_EACH_VEC_ELT (phis, i, exit_phi) 5230 { 5231 if (outer_loop) 5232 { 5233 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 5234 gphi *vect_phi; 5235 5236 /* FORNOW. Currently not supporting the case that an inner-loop 5237 reduction is not used in the outer-loop (but only outside the 5238 outer-loop), unless it is double reduction. */ 5239 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 5240 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)) 5241 || double_reduc); 5242 5243 if (double_reduc) 5244 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi; 5245 else 5246 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt; 5247 if (!double_reduc 5248 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) 5249 != vect_double_reduction_def) 5250 continue; 5251 5252 /* Handle double reduction: 5253 5254 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) 5255 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop) 5256 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) 5257 stmt4: s2 = phi <s4> - double reduction stmt (outer loop) 5258 5259 At that point the regular reduction (stmt2 and stmt3) is 5260 already vectorized, as well as the exit phi node, stmt4. 5261 Here we vectorize the phi node of double reduction, stmt1, and 5262 update all relevant statements. */ 5263 5264 /* Go through all the uses of s2 to find double reduction phi 5265 node, i.e., stmt1 above. 
*/ 5266 orig_name = PHI_RESULT (exit_phi); 5267 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 5268 { 5269 stmt_vec_info use_stmt_vinfo; 5270 stmt_vec_info new_phi_vinfo; 5271 tree vect_phi_init, preheader_arg, vect_phi_res, init_def; 5272 basic_block bb = gimple_bb (use_stmt); 5273 gimple *use; 5274 5275 /* Check that USE_STMT is really double reduction phi 5276 node. */ 5277 if (gimple_code (use_stmt) != GIMPLE_PHI 5278 || gimple_phi_num_args (use_stmt) != 2 5279 || bb->loop_father != outer_loop) 5280 continue; 5281 use_stmt_vinfo = vinfo_for_stmt (use_stmt); 5282 if (!use_stmt_vinfo 5283 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) 5284 != vect_double_reduction_def) 5285 continue; 5286 5287 /* Create vector phi node for double reduction: 5288 vs1 = phi <vs0, vs2> 5289 vs1 was created previously in this function by a call to 5290 vect_get_vec_def_for_operand and is stored in 5291 vec_initial_def; 5292 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI; 5293 vs0 is created here. */ 5294 5295 /* Create vector phi node. */ 5296 vect_phi = create_phi_node (vec_initial_def, bb); 5297 new_phi_vinfo = new_stmt_vec_info (vect_phi, 5298 loop_vec_info_for_loop (outer_loop)); 5299 set_vinfo_for_stmt (vect_phi, new_phi_vinfo); 5300 5301 /* Create vs0 - initial def of the double reduction phi. */ 5302 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, 5303 loop_preheader_edge (outer_loop)); 5304 init_def = get_initial_def_for_reduction (stmt, 5305 preheader_arg, NULL); 5306 vect_phi_init = vect_init_vector (use_stmt, init_def, 5307 vectype, NULL); 5308 5309 /* Update phi node arguments with vs0 and vs2. */ 5310 add_phi_arg (vect_phi, vect_phi_init, 5311 loop_preheader_edge (outer_loop), 5312 UNKNOWN_LOCATION); 5313 add_phi_arg (vect_phi, PHI_RESULT (inner_phi), 5314 loop_latch_edge (outer_loop), UNKNOWN_LOCATION); 5315 if (dump_enabled_p ()) 5316 { 5317 dump_printf_loc (MSG_NOTE, vect_location, 5318 "created double reduction phi node: "); 5319 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0); 5320 } 5321 5322 vect_phi_res = PHI_RESULT (vect_phi); 5323 5324 /* Replace the use, i.e., set the correct vs1 in the regular 5325 reduction phi node. FORNOW, NCOPIES is always 1, so the 5326 loop is redundant. */ 5327 use = reduction_phi; 5328 for (j = 0; j < ncopies; j++) 5329 { 5330 edge pr_edge = loop_preheader_edge (loop); 5331 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); 5332 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); 5333 } 5334 } 5335 } 5336 } 5337 5338 phis.release (); 5339 if (nested_in_vect_loop) 5340 { 5341 if (double_reduc) 5342 loop = outer_loop; 5343 else 5344 continue; 5345 } 5346 5347 phis.create (3); 5348 /* Find the loop-closed-use at the loop exit of the original scalar 5349 result. (The reduction result is expected to have two immediate uses, 5350 one at the latch block, and one at the loop exit). For double 5351 reductions we are looking for exit phis of the outer loop. 
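   For illustration (a sketch of loop-closed SSA, not the code at hand):
   a reduction value s_1 computed inside the loop reaches the code after
   the loop only through an exit phi,

       loop header:  s_0 = phi <init, s_1>
       loop body:    s_1 = s_0 + a[i]      (the latch use)
       loop exit:    s_out = phi <s_1>     (the use collected here)

   and for a double reduction the inner-loop exit phi is in turn used by
   an exit phi of the outer loop, which is the phi we need to reach.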
*/ 5352 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 5353 { 5354 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 5355 { 5356 if (!is_gimple_debug (USE_STMT (use_p))) 5357 phis.safe_push (USE_STMT (use_p)); 5358 } 5359 else 5360 { 5361 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) 5362 { 5363 tree phi_res = PHI_RESULT (USE_STMT (use_p)); 5364 5365 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) 5366 { 5367 if (!flow_bb_inside_loop_p (loop, 5368 gimple_bb (USE_STMT (phi_use_p))) 5369 && !is_gimple_debug (USE_STMT (phi_use_p))) 5370 phis.safe_push (USE_STMT (phi_use_p)); 5371 } 5372 } 5373 } 5374 } 5375 5376 FOR_EACH_VEC_ELT (phis, i, exit_phi) 5377 { 5378 /* Replace the uses: */ 5379 orig_name = PHI_RESULT (exit_phi); 5380 scalar_result = scalar_results[k]; 5381 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 5382 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 5383 SET_USE (use_p, scalar_result); 5384 } 5385 5386 phis.release (); 5387 } 5388 } 5389 5390 5391 /* Function is_nonwrapping_integer_induction. 5392 5393 Check if STMT (which is part of loop LOOP) both increments and 5394 does not cause overflow. */ 5395 5396 static bool 5397 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop) 5398 { 5399 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 5400 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); 5401 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); 5402 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt)); 5403 widest_int ni, max_loop_value, lhs_max; 5404 bool overflow = false; 5405 5406 /* Make sure the loop is integer based. */ 5407 if (TREE_CODE (base) != INTEGER_CST 5408 || TREE_CODE (step) != INTEGER_CST) 5409 return false; 5410 5411 /* Check that the induction increments. */ 5412 if (tree_int_cst_sgn (step) == -1) 5413 return false; 5414 5415 /* Check that the max size of the loop will not wrap. */ 5416 5417 if (TYPE_OVERFLOW_UNDEFINED (lhs_type)) 5418 return true; 5419 5420 if (! max_stmt_executions (loop, &ni)) 5421 return false; 5422 5423 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type), 5424 &overflow); 5425 if (overflow) 5426 return false; 5427 5428 max_loop_value = wi::add (wi::to_widest (base), max_loop_value, 5429 TYPE_SIGN (lhs_type), &overflow); 5430 if (overflow) 5431 return false; 5432 5433 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type)) 5434 <= TYPE_PRECISION (lhs_type)); 5435 } 5436 5437 /* Function vectorizable_reduction. 5438 5439 Check if STMT performs a reduction operation that can be vectorized. 5440 If VEC_STMT is also passed, vectorize the STMT: create a vectorized 5441 stmt to replace it, put it in VEC_STMT, and insert it at GSI. 5442 Return FALSE if not a vectorizable STMT, TRUE otherwise. 5443 5444 This function also handles reduction idioms (patterns) that have been 5445 recognized in advance during vect_pattern_recog. In this case, STMT may be 5446 of this form: 5447 X = pattern_expr (arg0, arg1, ..., X) 5448 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original 5449 sequence that had been detected and replaced by the pattern-stmt (STMT). 5450 5451 This function also handles reduction of condition expressions, for example: 5452 for (int i = 0; i < N; i++) 5453 if (a[i] < value) 5454 last = a[i]; 5455 This is handled by vectorising the loop and creating an additional vector 5456 containing the loop indexes for which "a[i] < value" was true. 
In the 5457 function epilogue this is reduced to a single max value and then used to 5458 index into the vector of results. 5459 5460 In some cases of reduction patterns, the type of the reduction variable X is 5461 different than the type of the other arguments of STMT. 5462 In such cases, the vectype that is used when transforming STMT into a vector 5463 stmt is different than the vectype that is used to determine the 5464 vectorization factor, because it consists of a different number of elements 5465 than the actual number of elements that are being operated upon in parallel. 5466 5467 For example, consider an accumulation of shorts into an int accumulator. 5468 On some targets it's possible to vectorize this pattern operating on 8 5469 shorts at a time (hence, the vectype for purposes of determining the 5470 vectorization factor should be V8HI); on the other hand, the vectype that 5471 is used to create the vector form is actually V4SI (the type of the result). 5472 5473 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 5474 indicates what is the actual level of parallelism (V8HI in the example), so 5475 that the right vectorization factor would be derived. This vectype 5476 corresponds to the type of arguments to the reduction stmt, and should *NOT* 5477 be used to create the vectorized stmt. The right vectype for the vectorized 5478 stmt is obtained from the type of the result X: 5479 get_vectype_for_scalar_type (TREE_TYPE (X)) 5480 5481 This means that, contrary to "regular" reductions (or "regular" stmts in 5482 general), the following equation: 5483 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) 5484 does *NOT* necessarily hold for reduction patterns. */ 5485 5486 bool 5487 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, 5488 gimple **vec_stmt, slp_tree slp_node) 5489 { 5490 tree vec_dest; 5491 tree scalar_dest; 5492 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE; 5493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5494 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 5495 tree vectype_in = NULL_TREE; 5496 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5497 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 5498 enum tree_code code, orig_code, epilog_reduc_code; 5499 machine_mode vec_mode; 5500 int op_type; 5501 optab optab, reduc_optab; 5502 tree new_temp = NULL_TREE; 5503 gimple *def_stmt; 5504 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; 5505 gphi *new_phi = NULL; 5506 gimple *cond_reduc_def_stmt = NULL; 5507 tree scalar_type; 5508 bool is_simple_use; 5509 gimple *orig_stmt; 5510 stmt_vec_info orig_stmt_info; 5511 tree expr = NULL_TREE; 5512 int i; 5513 int ncopies; 5514 int epilog_copies; 5515 stmt_vec_info prev_stmt_info, prev_phi_info; 5516 bool single_defuse_cycle = false; 5517 tree reduc_def = NULL_TREE; 5518 gimple *new_stmt = NULL; 5519 int j; 5520 tree ops[3]; 5521 bool nested_cycle = false, found_nested_cycle_def = false; 5522 gimple *reduc_def_stmt = NULL; 5523 bool double_reduc = false, dummy; 5524 basic_block def_bb; 5525 struct loop * def_stmt_loop, *outer_loop = NULL; 5526 tree def_arg; 5527 gimple *def_arg_stmt; 5528 auto_vec<tree> vec_oprnds0; 5529 auto_vec<tree> vec_oprnds1; 5530 auto_vec<tree> vect_defs; 5531 auto_vec<gimple *> phis; 5532 int vec_num; 5533 tree def0, def1, tem, op1 = NULL_TREE; 5534 bool first_p = true; 5535 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; 5536 tree cond_reduc_val = NULL_TREE; 5537 5538 /* In case 
of reduction chain we switch to the first stmt in the chain, but 5539 we don't update STMT_INFO, since only the last stmt is marked as reduction 5540 and has reduction properties. */ 5541 if (GROUP_FIRST_ELEMENT (stmt_info) 5542 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 5543 { 5544 stmt = GROUP_FIRST_ELEMENT (stmt_info); 5545 first_p = false; 5546 } 5547 5548 if (nested_in_vect_loop_p (loop, stmt)) 5549 { 5550 outer_loop = loop; 5551 loop = loop->inner; 5552 nested_cycle = true; 5553 } 5554 5555 /* 1. Is vectorizable reduction? */ 5556 /* Not supportable if the reduction variable is used in the loop, unless 5557 it's a reduction chain. */ 5558 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer 5559 && !GROUP_FIRST_ELEMENT (stmt_info)) 5560 return false; 5561 5562 /* Reductions that are not used even in an enclosing outer-loop, 5563 are expected to be "live" (used out of the loop). */ 5564 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope 5565 && !STMT_VINFO_LIVE_P (stmt_info)) 5566 return false; 5567 5568 /* Make sure it was already recognized as a reduction computation. */ 5569 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def 5570 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle) 5571 return false; 5572 5573 /* 2. Has this been recognized as a reduction pattern? 5574 5575 Check if STMT represents a pattern that has been recognized 5576 in earlier analysis stages. For stmts that represent a pattern, 5577 the STMT_VINFO_RELATED_STMT field records the last stmt in 5578 the original sequence that constitutes the pattern. */ 5579 5580 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)); 5581 if (orig_stmt) 5582 { 5583 orig_stmt_info = vinfo_for_stmt (orig_stmt); 5584 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 5585 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 5586 } 5587 5588 /* 3. Check the operands of the operation. The first operands are defined 5589 inside the loop body. The last operand is the reduction variable, 5590 which is defined by the loop-header-phi. */ 5591 5592 gcc_assert (is_gimple_assign (stmt)); 5593 5594 /* Flatten RHS. */ 5595 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 5596 { 5597 case GIMPLE_SINGLE_RHS: 5598 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)); 5599 if (op_type == ternary_op) 5600 { 5601 tree rhs = gimple_assign_rhs1 (stmt); 5602 ops[0] = TREE_OPERAND (rhs, 0); 5603 ops[1] = TREE_OPERAND (rhs, 1); 5604 ops[2] = TREE_OPERAND (rhs, 2); 5605 code = TREE_CODE (rhs); 5606 } 5607 else 5608 return false; 5609 break; 5610 5611 case GIMPLE_BINARY_RHS: 5612 code = gimple_assign_rhs_code (stmt); 5613 op_type = TREE_CODE_LENGTH (code); 5614 gcc_assert (op_type == binary_op); 5615 ops[0] = gimple_assign_rhs1 (stmt); 5616 ops[1] = gimple_assign_rhs2 (stmt); 5617 break; 5618 5619 case GIMPLE_TERNARY_RHS: 5620 code = gimple_assign_rhs_code (stmt); 5621 op_type = TREE_CODE_LENGTH (code); 5622 gcc_assert (op_type == ternary_op); 5623 ops[0] = gimple_assign_rhs1 (stmt); 5624 ops[1] = gimple_assign_rhs2 (stmt); 5625 ops[2] = gimple_assign_rhs3 (stmt); 5626 break; 5627 5628 case GIMPLE_UNARY_RHS: 5629 return false; 5630 5631 default: 5632 gcc_unreachable (); 5633 } 5634 /* The default is that the reduction variable is the last in statement. 
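   For example (illustrative statements only): in

       x_3 = _t5 + x_2;                     reduction operand is ops[1]
       x_3 = x_2 - _t5;    (MINUS_EXPR)     reduction operand is ops[0]

   the reduction operand of the PLUS is the last operand, while for the
   non-commutative MINUS it has to be the first one, hence the special
   case below.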
*/ 5635 int reduc_index = op_type - 1; 5636 if (code == MINUS_EXPR) 5637 reduc_index = 0; 5638 5639 if (code == COND_EXPR && slp_node) 5640 return false; 5641 5642 scalar_dest = gimple_assign_lhs (stmt); 5643 scalar_type = TREE_TYPE (scalar_dest); 5644 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 5645 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 5646 return false; 5647 5648 /* Do not try to vectorize bit-precision reductions. */ 5649 if ((TYPE_PRECISION (scalar_type) 5650 != GET_MODE_PRECISION (TYPE_MODE (scalar_type)))) 5651 return false; 5652 5653 /* All uses but the last are expected to be defined in the loop. 5654 The last use is the reduction variable. In case of nested cycle this 5655 assumption is not true: we use reduc_index to record the index of the 5656 reduction variable. */ 5657 for (i = 0; i < op_type; i++) 5658 { 5659 if (i == reduc_index) 5660 continue; 5661 5662 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ 5663 if (i == 0 && code == COND_EXPR) 5664 continue; 5665 5666 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, 5667 &def_stmt, &dt, &tem); 5668 if (!vectype_in) 5669 vectype_in = tem; 5670 gcc_assert (is_simple_use); 5671 5672 if (dt != vect_internal_def 5673 && dt != vect_external_def 5674 && dt != vect_constant_def 5675 && dt != vect_induction_def 5676 && !(dt == vect_nested_cycle && nested_cycle)) 5677 return false; 5678 5679 if (dt == vect_nested_cycle) 5680 { 5681 found_nested_cycle_def = true; 5682 reduc_def_stmt = def_stmt; 5683 reduc_index = i; 5684 } 5685 5686 if (i == 1 && code == COND_EXPR) 5687 { 5688 /* Record how value of COND_EXPR is defined. */ 5689 if (dt == vect_constant_def) 5690 { 5691 cond_reduc_dt = dt; 5692 cond_reduc_val = ops[i]; 5693 } 5694 if (dt == vect_induction_def 5695 && def_stmt != NULL 5696 && is_nonwrapping_integer_induction (def_stmt, loop)) 5697 { 5698 cond_reduc_dt = dt; 5699 cond_reduc_def_stmt = def_stmt; 5700 } 5701 } 5702 } 5703 5704 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo, 5705 &def_stmt, &dt, &tem); 5706 if (!vectype_in) 5707 vectype_in = tem; 5708 gcc_assert (is_simple_use); 5709 if (!found_nested_cycle_def) 5710 reduc_def_stmt = def_stmt; 5711 5712 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI) 5713 return false; 5714 5715 if (!(dt == vect_reduction_def 5716 || dt == vect_nested_cycle 5717 || ((dt == vect_internal_def || dt == vect_external_def 5718 || dt == vect_constant_def || dt == vect_induction_def) 5719 && nested_cycle && found_nested_cycle_def))) 5720 { 5721 /* For pattern recognized stmts, orig_stmt might be a reduction, 5722 but some helper statements for the pattern might not, or 5723 might be COND_EXPRs with reduction uses in the condition. */ 5724 gcc_assert (orig_stmt); 5725 return false; 5726 } 5727 5728 enum vect_reduction_type v_reduc_type; 5729 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt, 5730 !nested_cycle, &dummy, false, 5731 &v_reduc_type); 5732 5733 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type; 5734 /* If we have a condition reduction, see if we can simplify it further. 
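   A hedged sketch of the simplification attempted below: for a loop like

       last = -1;
       for (i = 1; i <= n; i++)
         if (a[i] < bound)
           last = i;

   the value that is conditionally stored is a nonwrapping integer
   induction, so any constant strictly below its base (here 0, since the
   induction starts at 1) can represent "no match yet" in the vector
   accumulator, and the epilogue can reduce that accumulator with a plain
   MAX (INTEGER_INDUC_COND_REDUCTION) instead of maintaining a separate
   index vector.  Similarly, if the stored value is a compile-time
   constant and the initial value of the reduction PHI is a constant as
   well, the whole reduction collapses to a MIN or MAX
   (CONST_COND_REDUCTION).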
*/ 5735 if (v_reduc_type == COND_REDUCTION) 5736 { 5737 if (cond_reduc_dt == vect_induction_def) 5738 { 5739 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt); 5740 tree base 5741 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); 5742 5743 gcc_assert (TREE_CODE (base) == INTEGER_CST); 5744 cond_reduc_val = NULL_TREE; 5745 /* Find a suitable value below base; punt if base is the minimum 5746 value of the type for now. */ 5747 if (tree_int_cst_sgn (base) == 1) 5748 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 5749 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), base)) 5750 cond_reduc_val 5751 = int_const_binop (MINUS_EXPR, base, integer_one_node); 5752 if (cond_reduc_val) 5753 { 5754 if (dump_enabled_p ()) 5755 dump_printf_loc (MSG_NOTE, vect_location, 5756 "condition expression based on " 5757 "integer induction.\n"); 5758 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5759 = INTEGER_INDUC_COND_REDUCTION; 5760 } 5761 } 5762 5763 /* Loop peeling modifies initial value of reduction PHI, which 5764 makes the reduction stmt to be transformed different to the 5765 original stmt analyzed. We need to record reduction code for 5766 CONST_COND_REDUCTION type reduction at analyzing stage, thus 5767 it can be used directly at transform stage. */ 5768 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR 5769 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR) 5770 { 5771 /* Also set the reduction type to CONST_COND_REDUCTION. */ 5772 gcc_assert (cond_reduc_dt == vect_constant_def); 5773 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; 5774 } 5775 else if (cond_reduc_dt == vect_constant_def) 5776 { 5777 enum vect_def_type cond_initial_dt; 5778 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]); 5779 tree cond_initial_val 5780 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); 5781 5782 gcc_assert (cond_reduc_val != NULL_TREE); 5783 vect_is_simple_use (cond_initial_val, loop_vinfo, 5784 &def_stmt, &cond_initial_dt); 5785 if (cond_initial_dt == vect_constant_def 5786 && types_compatible_p (TREE_TYPE (cond_initial_val), 5787 TREE_TYPE (cond_reduc_val))) 5788 { 5789 tree e = fold_build2 (LE_EXPR, boolean_type_node, 5790 cond_initial_val, cond_reduc_val); 5791 if (e && (integer_onep (e) || integer_zerop (e))) 5792 { 5793 if (dump_enabled_p ()) 5794 dump_printf_loc (MSG_NOTE, vect_location, 5795 "condition expression based on " 5796 "compile time constant.\n"); 5797 /* Record reduction code at analysis stage. */ 5798 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) 5799 = integer_onep (e) ? MAX_EXPR : MIN_EXPR; 5800 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5801 = CONST_COND_REDUCTION; 5802 } 5803 } 5804 } 5805 } 5806 5807 if (orig_stmt) 5808 gcc_assert (tmp == orig_stmt 5809 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt); 5810 else 5811 /* We changed STMT to be the first stmt in reduction chain, hence we 5812 check that in this case the first element in the chain is STMT. 
*/ 5813 gcc_assert (stmt == tmp 5814 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt); 5815 5816 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt))) 5817 return false; 5818 5819 if (slp_node) 5820 ncopies = 1; 5821 else 5822 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 5823 / TYPE_VECTOR_SUBPARTS (vectype_in)); 5824 5825 gcc_assert (ncopies >= 1); 5826 5827 vec_mode = TYPE_MODE (vectype_in); 5828 5829 if (code == COND_EXPR) 5830 { 5831 /* Only call during the analysis stage, otherwise we'll lose 5832 STMT_VINFO_TYPE. */ 5833 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL, 5834 ops[reduc_index], 0, NULL)) 5835 { 5836 if (dump_enabled_p ()) 5837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5838 "unsupported condition in reduction\n"); 5839 return false; 5840 } 5841 } 5842 else 5843 { 5844 /* 4. Supportable by target? */ 5845 5846 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR 5847 || code == LROTATE_EXPR || code == RROTATE_EXPR) 5848 { 5849 /* Shifts and rotates are only supported by vectorizable_shifts, 5850 not vectorizable_reduction. */ 5851 if (dump_enabled_p ()) 5852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5853 "unsupported shift or rotation.\n"); 5854 return false; 5855 } 5856 5857 /* 4.1. check support for the operation in the loop */ 5858 optab = optab_for_tree_code (code, vectype_in, optab_default); 5859 if (!optab) 5860 { 5861 if (dump_enabled_p ()) 5862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5863 "no optab.\n"); 5864 5865 return false; 5866 } 5867 5868 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) 5869 { 5870 if (dump_enabled_p ()) 5871 dump_printf (MSG_NOTE, "op not supported by target.\n"); 5872 5873 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD 5874 || LOOP_VINFO_VECT_FACTOR (loop_vinfo) 5875 < vect_min_worthwhile_factor (code)) 5876 return false; 5877 5878 if (dump_enabled_p ()) 5879 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); 5880 } 5881 5882 /* Worthwhile without SIMD support? */ 5883 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in)) 5884 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) 5885 < vect_min_worthwhile_factor (code)) 5886 { 5887 if (dump_enabled_p ()) 5888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5889 "not worthwhile without SIMD support.\n"); 5890 5891 return false; 5892 } 5893 } 5894 5895 /* 4.2. Check support for the epilog operation. 5896 5897 If STMT represents a reduction pattern, then the type of the 5898 reduction variable may be different than the type of the rest 5899 of the arguments. For example, consider the case of accumulation 5900 of shorts into an int accumulator; The original code: 5901 S1: int_a = (int) short_a; 5902 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; 5903 5904 was replaced with: 5905 STMT: int_acc = widen_sum <short_a, int_acc> 5906 5907 This means that: 5908 1. The tree-code that is used to create the vector operation in the 5909 epilog code (that reduces the partial results) is not the 5910 tree-code of STMT, but is rather the tree-code of the original 5911 stmt from the pattern that STMT is replacing. I.e, in the example 5912 above we want to use 'widen_sum' in the loop, but 'plus' in the 5913 epilog. 5914 2. The type (mode) we use to check available target support 5915 for the vector operation to be created in the *epilog*, is 5916 determined by the type of the reduction variable (in the example 5917 above we'd check this: optab_handler (plus_optab, vect_int_mode])). 
5918 However the type (mode) we use to check available target support 5919 for the vector operation to be created *inside the loop*, is 5920 determined by the type of the other arguments to STMT (in the 5921 example we'd check this: optab_handler (widen_sum_optab, 5922 vect_short_mode)). 5923 5924 This is contrary to "regular" reductions, in which the types of all 5925 the arguments are the same as the type of the reduction variable. 5926 For "regular" reductions we can therefore use the same vector type 5927 (and also the same tree-code) when generating the epilog code and 5928 when generating the code inside the loop. */ 5929 5930 if (orig_stmt) 5931 { 5932 /* This is a reduction pattern: get the vectype from the type of the 5933 reduction variable, and get the tree-code from orig_stmt. */ 5934 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5935 == TREE_CODE_REDUCTION); 5936 orig_code = gimple_assign_rhs_code (orig_stmt); 5937 gcc_assert (vectype_out); 5938 vec_mode = TYPE_MODE (vectype_out); 5939 } 5940 else 5941 { 5942 /* Regular reduction: use the same vectype and tree-code as used for 5943 the vector code inside the loop can be used for the epilog code. */ 5944 orig_code = code; 5945 5946 if (code == MINUS_EXPR) 5947 orig_code = PLUS_EXPR; 5948 5949 /* For simple condition reductions, replace with the actual expression 5950 we want to base our reduction around. */ 5951 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION) 5952 { 5953 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 5954 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); 5955 } 5956 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5957 == INTEGER_INDUC_COND_REDUCTION) 5958 orig_code = MAX_EXPR; 5959 } 5960 5961 if (nested_cycle) 5962 { 5963 def_bb = gimple_bb (reduc_def_stmt); 5964 def_stmt_loop = def_bb->loop_father; 5965 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 5966 loop_preheader_edge (def_stmt_loop)); 5967 if (TREE_CODE (def_arg) == SSA_NAME 5968 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg)) 5969 && gimple_code (def_arg_stmt) == GIMPLE_PHI 5970 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt)) 5971 && vinfo_for_stmt (def_arg_stmt) 5972 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt)) 5973 == vect_double_reduction_def) 5974 double_reduc = true; 5975 } 5976 5977 epilog_reduc_code = ERROR_MARK; 5978 5979 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION) 5980 { 5981 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) 5982 { 5983 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out, 5984 optab_default); 5985 if (!reduc_optab) 5986 { 5987 if (dump_enabled_p ()) 5988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5989 "no optab for reduction.\n"); 5990 5991 epilog_reduc_code = ERROR_MARK; 5992 } 5993 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing) 5994 { 5995 if (dump_enabled_p ()) 5996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5997 "reduc op not supported by target.\n"); 5998 5999 epilog_reduc_code = ERROR_MARK; 6000 } 6001 6002 /* When epilog_reduc_code is ERROR_MARK then a reduction will be 6003 generated in the epilog using multiple expressions. This does not 6004 work for condition reductions. 
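   For instance (rough sketch only), lacking a reduc_plus optab a
   4-element vector sum can still be reduced in the epilogue with a few
   whole-vector shifts and adds followed by a scalar extract:

       v1 = v  + (v  shifted by 2 elements)
       v2 = v1 + (v1 shifted by 1 element)
       s  = v2[0]

   A condition reduction, however, additionally has to identify which
   lane supplied the final value, which this expansion cannot express.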
*/ 6005 if (epilog_reduc_code == ERROR_MARK 6006 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6007 == INTEGER_INDUC_COND_REDUCTION 6008 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6009 == CONST_COND_REDUCTION)) 6010 { 6011 if (dump_enabled_p ()) 6012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6013 "no reduc code for scalar code.\n"); 6014 return false; 6015 } 6016 } 6017 else 6018 { 6019 if (!nested_cycle || double_reduc) 6020 { 6021 if (dump_enabled_p ()) 6022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6023 "no reduc code for scalar code.\n"); 6024 6025 return false; 6026 } 6027 } 6028 } 6029 else 6030 { 6031 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type)); 6032 cr_index_scalar_type = make_unsigned_type (scalar_precision); 6033 cr_index_vector_type = build_vector_type 6034 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out)); 6035 6036 epilog_reduc_code = REDUC_MAX_EXPR; 6037 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type, 6038 optab_default); 6039 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type)) 6040 == CODE_FOR_nothing) 6041 { 6042 if (dump_enabled_p ()) 6043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6044 "reduc max op not supported by target.\n"); 6045 return false; 6046 } 6047 } 6048 6049 if ((double_reduc 6050 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION) 6051 && ncopies > 1) 6052 { 6053 if (dump_enabled_p ()) 6054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6055 "multiple types in double reduction or condition " 6056 "reduction.\n"); 6057 return false; 6058 } 6059 6060 /* In case of widenning multiplication by a constant, we update the type 6061 of the constant to be the type of the other operand. We check that the 6062 constant fits the type in the pattern recognition pass. */ 6063 if (code == DOT_PROD_EXPR 6064 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1]))) 6065 { 6066 if (TREE_CODE (ops[0]) == INTEGER_CST) 6067 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]); 6068 else if (TREE_CODE (ops[1]) == INTEGER_CST) 6069 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]); 6070 else 6071 { 6072 if (dump_enabled_p ()) 6073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6074 "invalid types in dot-prod\n"); 6075 6076 return false; 6077 } 6078 } 6079 6080 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 6081 { 6082 widest_int ni; 6083 6084 if (! max_loop_iterations (loop, &ni)) 6085 { 6086 if (dump_enabled_p ()) 6087 dump_printf_loc (MSG_NOTE, vect_location, 6088 "loop count not known, cannot create cond " 6089 "reduction.\n"); 6090 return false; 6091 } 6092 /* Convert backedges to iterations. */ 6093 ni += 1; 6094 6095 /* The additional index will be the same type as the condition. Check 6096 that the loop can fit into this less one (because we'll use up the 6097 zero slot for when there are no matches). */ 6098 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); 6099 if (wi::geu_p (ni, wi::to_widest (max_index))) 6100 { 6101 if (dump_enabled_p ()) 6102 dump_printf_loc (MSG_NOTE, vect_location, 6103 "loop size is greater than data size.\n"); 6104 return false; 6105 } 6106 } 6107 6108 if (!vec_stmt) /* transformation not required. */ 6109 { 6110 if (first_p 6111 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies, 6112 reduc_index)) 6113 return false; 6114 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6115 return true; 6116 } 6117 6118 /** Transform. 
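   (Illustrative outline only.)  For a simple accumulation

       for (i = 0; i < n; i++)
         s += a[i];

   the code below creates a reduction phi and a vector statement per copy,

       vs_0 = phi <{0,0,0,0}, vs_1>        loop header
       vs_1 = vs_0 + pa[i]                 loop body

   and leaves the final cross-lane reduction of the accumulator, as well
   as the adjustment by the scalar initial value, to
   vect_create_epilog_for_reduction.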
**/ 6119 6120 if (dump_enabled_p ()) 6121 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); 6122 6123 /* FORNOW: Multiple types are not supported for condition. */ 6124 if (code == COND_EXPR) 6125 gcc_assert (ncopies == 1); 6126 6127 /* Create the destination vector */ 6128 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 6129 6130 /* In case the vectorization factor (VF) is bigger than the number 6131 of elements that we can fit in a vectype (nunits), we have to generate 6132 more than one vector stmt - i.e - we need to "unroll" the 6133 vector stmt by a factor VF/nunits. For more details see documentation 6134 in vectorizable_operation. */ 6135 6136 /* If the reduction is used in an outer loop we need to generate 6137 VF intermediate results, like so (e.g. for ncopies=2): 6138 r0 = phi (init, r0) 6139 r1 = phi (init, r1) 6140 r0 = x0 + r0; 6141 r1 = x1 + r1; 6142 (i.e. we generate VF results in 2 registers). 6143 In this case we have a separate def-use cycle for each copy, and therefore 6144 for each copy we get the vector def for the reduction variable from the 6145 respective phi node created for this copy. 6146 6147 Otherwise (the reduction is unused in the loop nest), we can combine 6148 together intermediate results, like so (e.g. for ncopies=2): 6149 r = phi (init, r) 6150 r = x0 + r; 6151 r = x1 + r; 6152 (i.e. we generate VF/2 results in a single register). 6153 In this case for each copy we get the vector def for the reduction variable 6154 from the vectorized reduction operation generated in the previous iteration. 6155 */ 6156 6157 if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) 6158 { 6159 single_defuse_cycle = true; 6160 epilog_copies = 1; 6161 } 6162 else 6163 epilog_copies = ncopies; 6164 6165 prev_stmt_info = NULL; 6166 prev_phi_info = NULL; 6167 if (slp_node) 6168 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 6169 else 6170 { 6171 vec_num = 1; 6172 vec_oprnds0.create (1); 6173 if (op_type == ternary_op) 6174 vec_oprnds1.create (1); 6175 } 6176 6177 phis.create (vec_num); 6178 vect_defs.create (vec_num); 6179 if (!slp_node) 6180 vect_defs.quick_push (NULL_TREE); 6181 6182 for (j = 0; j < ncopies; j++) 6183 { 6184 if (j == 0 || !single_defuse_cycle) 6185 { 6186 for (i = 0; i < vec_num; i++) 6187 { 6188 /* Create the reduction-phi that defines the reduction 6189 operand. */ 6190 new_phi = create_phi_node (vec_dest, loop->header); 6191 set_vinfo_for_stmt (new_phi, 6192 new_stmt_vec_info (new_phi, loop_vinfo)); 6193 if (j == 0 || slp_node) 6194 phis.quick_push (new_phi); 6195 } 6196 } 6197 6198 if (code == COND_EXPR) 6199 { 6200 gcc_assert (!slp_node); 6201 vectorizable_condition (stmt, gsi, vec_stmt, 6202 PHI_RESULT (phis[0]), 6203 reduc_index, NULL); 6204 /* Multiple types are not supported for condition. */ 6205 break; 6206 } 6207 6208 /* Handle uses. */ 6209 if (j == 0) 6210 { 6211 if (slp_node) 6212 { 6213 /* Get vec defs for all the operands except the reduction index, 6214 ensuring the ordering of the ops in the vector is kept. */ 6215 auto_vec<tree, 3> slp_ops; 6216 auto_vec<vec<tree>, 3> vec_defs; 6217 6218 slp_ops.quick_push (reduc_index == 0 ? NULL : ops[0]); 6219 slp_ops.quick_push (reduc_index == 1 ? NULL : ops[1]); 6220 if (op_type == ternary_op) 6221 slp_ops.quick_push (reduc_index == 2 ? NULL : ops[2]); 6222 6223 vect_get_slp_defs (slp_ops, slp_node, &vec_defs, -1); 6224 6225 vec_oprnds0.safe_splice (vec_defs[reduc_index == 0 ? 1 : 0]); 6226 vec_defs[reduc_index == 0 ? 
1 : 0].release (); 6227 if (op_type == ternary_op) 6228 { 6229 vec_oprnds1.safe_splice (vec_defs[reduc_index == 2 ? 1 : 2]); 6230 vec_defs[reduc_index == 2 ? 1 : 2].release (); 6231 } 6232 } 6233 else 6234 { 6235 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], 6236 stmt); 6237 vec_oprnds0.quick_push (loop_vec_def0); 6238 if (op_type == ternary_op) 6239 { 6240 op1 = reduc_index == 0 ? ops[2] : ops[1]; 6241 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt); 6242 vec_oprnds1.quick_push (loop_vec_def1); 6243 } 6244 } 6245 } 6246 else 6247 { 6248 if (!slp_node) 6249 { 6250 enum vect_def_type dt; 6251 gimple *dummy_stmt; 6252 6253 vect_is_simple_use (ops[!reduc_index], loop_vinfo, 6254 &dummy_stmt, &dt); 6255 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, 6256 loop_vec_def0); 6257 vec_oprnds0[0] = loop_vec_def0; 6258 if (op_type == ternary_op) 6259 { 6260 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt); 6261 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, 6262 loop_vec_def1); 6263 vec_oprnds1[0] = loop_vec_def1; 6264 } 6265 } 6266 6267 if (single_defuse_cycle) 6268 reduc_def = gimple_assign_lhs (new_stmt); 6269 6270 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; 6271 } 6272 6273 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 6274 { 6275 if (slp_node) 6276 reduc_def = PHI_RESULT (phis[i]); 6277 else 6278 { 6279 if (!single_defuse_cycle || j == 0) 6280 reduc_def = PHI_RESULT (new_phi); 6281 } 6282 6283 def1 = ((op_type == ternary_op) 6284 ? vec_oprnds1[i] : NULL); 6285 if (op_type == binary_op) 6286 { 6287 if (reduc_index == 0) 6288 expr = build2 (code, vectype_out, reduc_def, def0); 6289 else 6290 expr = build2 (code, vectype_out, def0, reduc_def); 6291 } 6292 else 6293 { 6294 if (reduc_index == 0) 6295 expr = build3 (code, vectype_out, reduc_def, def0, def1); 6296 else 6297 { 6298 if (reduc_index == 1) 6299 expr = build3 (code, vectype_out, def0, reduc_def, def1); 6300 else 6301 expr = build3 (code, vectype_out, def0, def1, reduc_def); 6302 } 6303 } 6304 6305 new_stmt = gimple_build_assign (vec_dest, expr); 6306 new_temp = make_ssa_name (vec_dest, new_stmt); 6307 gimple_assign_set_lhs (new_stmt, new_temp); 6308 vect_finish_stmt_generation (stmt, new_stmt, gsi); 6309 6310 if (slp_node) 6311 { 6312 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); 6313 vect_defs.quick_push (new_temp); 6314 } 6315 else 6316 vect_defs[0] = new_temp; 6317 } 6318 6319 if (slp_node) 6320 continue; 6321 6322 if (j == 0) 6323 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; 6324 else 6325 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; 6326 6327 prev_stmt_info = vinfo_for_stmt (new_stmt); 6328 prev_phi_info = vinfo_for_stmt (new_phi); 6329 } 6330 6331 tree indx_before_incr, indx_after_incr, cond_name = NULL; 6332 6333 /* Finalize the reduction-phi (set its arguments) and create the 6334 epilog reduction code. */ 6335 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) 6336 { 6337 new_temp = gimple_assign_lhs (*vec_stmt); 6338 vect_defs[0] = new_temp; 6339 6340 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) 6341 which is updated with the current index of the loop for every match of 6342 the original loop's cond_expr (VEC_STMT). This results in a vector 6343 containing the last time the condition passed for that vector lane. 6344 The first match will be a 1 to allow 0 to be used for non-matching 6345 indexes. If there are no matches at all then the vector will be all 6346 zeroes. 
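   A small worked example (the lane count is illustrative): with four
   lanes the induction starts at {1,2,3,4} and steps by {4,4,4,4}, so in
   vector iteration j its lanes hold the 1-based scalar iteration numbers
   4j+1 .. 4j+4.  Each new cond_expr keeps the induction value in the
   lanes whose condition matched and the previous phi value elsewhere, so
   after the loop every lane holds the last scalar iteration in which its
   condition held, or 0 if it never did.  The epilogue then reduces this
   index vector with REDUC_MAX to locate the winning lane and extracts
   the corresponding result.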
*/ 6347 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 6348 { 6349 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); 6350 int k; 6351 6352 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR); 6353 6354 /* First we create a simple vector induction variable which starts 6355 with the values {1,2,3,...} (SERIES_VECT) and increments by the 6356 vector size (STEP). */ 6357 6358 /* Create a {1,2,3,...} vector. */ 6359 tree *vtemp = XALLOCAVEC (tree, nunits_out); 6360 for (k = 0; k < nunits_out; ++k) 6361 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1); 6362 tree series_vect = build_vector (cr_index_vector_type, vtemp); 6363 6364 /* Create a vector of the step value. */ 6365 tree step = build_int_cst (cr_index_scalar_type, nunits_out); 6366 tree vec_step = build_vector_from_val (cr_index_vector_type, step); 6367 6368 /* Create an induction variable. */ 6369 gimple_stmt_iterator incr_gsi; 6370 bool insert_after; 6371 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 6372 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, 6373 insert_after, &indx_before_incr, &indx_after_incr); 6374 6375 /* Next create a new phi node vector (NEW_PHI_TREE) which starts 6376 filled with zeros (VEC_ZERO). */ 6377 6378 /* Create a vector of 0s. */ 6379 tree zero = build_zero_cst (cr_index_scalar_type); 6380 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); 6381 6382 /* Create a vector phi node. */ 6383 tree new_phi_tree = make_ssa_name (cr_index_vector_type); 6384 new_phi = create_phi_node (new_phi_tree, loop->header); 6385 set_vinfo_for_stmt (new_phi, 6386 new_stmt_vec_info (new_phi, loop_vinfo)); 6387 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop), 6388 UNKNOWN_LOCATION); 6389 6390 /* Now take the condition from the loops original cond_expr 6391 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for 6392 every match uses values from the induction variable 6393 (INDEX_BEFORE_INCR) otherwise uses values from the phi node 6394 (NEW_PHI_TREE). 6395 Finally, we update the phi (NEW_PHI_TREE) to take the value of 6396 the new cond_expr (INDEX_COND_EXPR). */ 6397 6398 /* Duplicate the condition from vec_stmt. */ 6399 tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt)); 6400 6401 /* Create a conditional, where the condition is taken from vec_stmt 6402 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and 6403 else is the phi (NEW_PHI_TREE). */ 6404 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, 6405 ccompare, indx_before_incr, 6406 new_phi_tree); 6407 cond_name = make_ssa_name (cr_index_vector_type); 6408 gimple *index_condition = gimple_build_assign (cond_name, 6409 index_cond_expr); 6410 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); 6411 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition, 6412 loop_vinfo); 6413 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; 6414 set_vinfo_for_stmt (index_condition, index_vec_info); 6415 6416 /* Update the phi with the vec cond. */ 6417 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop), 6418 UNKNOWN_LOCATION); 6419 } 6420 } 6421 6422 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies, 6423 epilog_reduc_code, phis, reduc_index, 6424 double_reduc, slp_node, cond_name, 6425 cond_reduc_val); 6426 6427 return true; 6428 } 6429 6430 /* Function vect_min_worthwhile_factor. 
6431 6432 For a loop where we could vectorize the operation indicated by CODE, 6433 return the minimum vectorization factor that makes it worthwhile 6434 to use generic vectors. */ 6435 int 6436 vect_min_worthwhile_factor (enum tree_code code) 6437 { 6438 switch (code) 6439 { 6440 case PLUS_EXPR: 6441 case MINUS_EXPR: 6442 case NEGATE_EXPR: 6443 return 4; 6444 6445 case BIT_AND_EXPR: 6446 case BIT_IOR_EXPR: 6447 case BIT_XOR_EXPR: 6448 case BIT_NOT_EXPR: 6449 return 2; 6450 6451 default: 6452 return INT_MAX; 6453 } 6454 } 6455 6456 6457 /* Function vectorizable_induction 6458 6459 Check if PHI performs an induction computation that can be vectorized. 6460 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized 6461 phi to replace it, put it in VEC_STMT, and add it to the same basic block. 6462 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ 6463 6464 bool 6465 vectorizable_induction (gimple *phi, 6466 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 6467 gimple **vec_stmt) 6468 { 6469 stmt_vec_info stmt_info = vinfo_for_stmt (phi); 6470 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6471 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6472 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6473 int nunits = TYPE_VECTOR_SUBPARTS (vectype); 6474 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; 6475 tree vec_def; 6476 6477 gcc_assert (ncopies >= 1); 6478 /* FORNOW. These restrictions should be relaxed. */ 6479 if (nested_in_vect_loop_p (loop, phi)) 6480 { 6481 imm_use_iterator imm_iter; 6482 use_operand_p use_p; 6483 gimple *exit_phi; 6484 edge latch_e; 6485 tree loop_arg; 6486 6487 if (ncopies > 1) 6488 { 6489 if (dump_enabled_p ()) 6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6491 "multiple types in nested loop.\n"); 6492 return false; 6493 } 6494 6495 exit_phi = NULL; 6496 latch_e = loop_latch_edge (loop->inner); 6497 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 6498 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 6499 { 6500 gimple *use_stmt = USE_STMT (use_p); 6501 if (is_gimple_debug (use_stmt)) 6502 continue; 6503 6504 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt))) 6505 { 6506 exit_phi = use_stmt; 6507 break; 6508 } 6509 } 6510 if (exit_phi) 6511 { 6512 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 6513 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 6514 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) 6515 { 6516 if (dump_enabled_p ()) 6517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6518 "inner-loop induction only used outside " 6519 "of the outer vectorized loop.\n"); 6520 return false; 6521 } 6522 } 6523 } 6524 6525 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 6526 return false; 6527 6528 /* FORNOW: SLP not supported. */ 6529 if (STMT_SLP_TYPE (stmt_info)) 6530 return false; 6531 6532 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def); 6533 6534 if (gimple_code (phi) != GIMPLE_PHI) 6535 return false; 6536 6537 if (!vec_stmt) /* transformation not required. */ 6538 { 6539 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; 6540 if (dump_enabled_p ()) 6541 dump_printf_loc (MSG_NOTE, vect_location, 6542 "=== vectorizable_induction ===\n"); 6543 vect_model_induction_cost (stmt_info, ncopies); 6544 return true; 6545 } 6546 6547 /** Transform. 
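   (Illustrative outline only.)  For a simple induction i = 0, 1, 2, ...
   and a four-lane vector the generated code follows the usual scheme

       vi_0 = phi <{0,1,2,3}, vi_1>        loop header
       vi_1 = vi_0 + {4,4,4,4}             step = VF * scalar step

   where get_initial_def_for_induction derives the initial vector and the
   step vector from the evolution (base and step) of PHI.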
**/ 6548 6549 if (dump_enabled_p ()) 6550 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); 6551 6552 vec_def = get_initial_def_for_induction (phi); 6553 *vec_stmt = SSA_NAME_DEF_STMT (vec_def); 6554 return true; 6555 } 6556 6557 /* Function vectorizable_live_operation. 6558 6559 STMT computes a value that is used outside the loop. Check if 6560 it can be supported. */ 6561 6562 bool 6563 vectorizable_live_operation (gimple *stmt, 6564 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 6565 slp_tree slp_node, int slp_index, 6566 gimple **vec_stmt) 6567 { 6568 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6569 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6570 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6571 imm_use_iterator imm_iter; 6572 tree lhs, lhs_type, bitsize, vec_bitsize; 6573 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6574 int nunits = TYPE_VECTOR_SUBPARTS (vectype); 6575 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; 6576 gimple *use_stmt; 6577 auto_vec<tree> vec_oprnds; 6578 6579 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); 6580 6581 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) 6582 return false; 6583 6584 /* FORNOW. CHECKME. */ 6585 if (nested_in_vect_loop_p (loop, stmt)) 6586 return false; 6587 6588 /* If STMT is not relevant and it is a simple assignment and its inputs are 6589 invariant then it can remain in place, unvectorized. The original last 6590 scalar value that it computes will be used. */ 6591 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 6592 { 6593 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo)); 6594 if (dump_enabled_p ()) 6595 dump_printf_loc (MSG_NOTE, vect_location, 6596 "statement is simple and uses invariant. Leaving in " 6597 "place.\n"); 6598 return true; 6599 } 6600 6601 if (!vec_stmt) 6602 /* No transformation required. */ 6603 return true; 6604 6605 /* If stmt has a related stmt, then use that for getting the lhs. */ 6606 if (is_pattern_stmt_p (stmt_info)) 6607 stmt = STMT_VINFO_RELATED_STMT (stmt_info); 6608 6609 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt) 6610 : gimple_get_lhs (stmt); 6611 lhs_type = TREE_TYPE (lhs); 6612 6613 bitsize = TYPE_SIZE (TREE_TYPE (vectype)); 6614 vec_bitsize = TYPE_SIZE (vectype); 6615 6616 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */ 6617 tree vec_lhs, bitstart; 6618 if (slp_node) 6619 { 6620 gcc_assert (slp_index >= 0); 6621 6622 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length (); 6623 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 6624 6625 /* Get the last occurrence of the scalar index from the concatenation of 6626 all the slp vectors. Calculate which slp vector it is and the index 6627 within. */ 6628 int pos = (num_vec * nunits) - num_scalar + slp_index; 6629 int vec_entry = pos / nunits; 6630 int vec_index = pos % nunits; 6631 6632 /* Get the correct slp vectorized stmt. */ 6633 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]); 6634 6635 /* Get entry to use. */ 6636 bitstart = build_int_cst (unsigned_type_node, vec_index); 6637 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart); 6638 } 6639 else 6640 { 6641 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info); 6642 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt); 6643 6644 /* For multiple copies, get the last copy. */ 6645 for (int i = 1; i < ncopies; ++i) 6646 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, 6647 vec_lhs); 6648 6649 /* Get the last lane in the vector. 
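   For example, for a V4SI vector vec_bitsize is 128 and bitsize is 32,
   so bitstart below becomes 96 and the BIT_FIELD_REF extracts lane 3,
   the lane written by the last vectorized scalar iteration.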
*/ 6650 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize); 6651 } 6652 6653 /* Create a new vectorized stmt for the uses of STMT and insert outside the 6654 loop. */ 6655 gimple_seq stmts = NULL; 6656 tree bftype = TREE_TYPE (vectype); 6657 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 6658 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); 6659 tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart); 6660 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts, 6661 true, NULL_TREE); 6662 if (stmts) 6663 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts); 6664 6665 /* Replace use of lhs with newly computed result. If the use stmt is a 6666 single arg PHI, just replace all uses of PHI result. It's necessary 6667 because lcssa PHI defining lhs may be before newly inserted stmt. */ 6668 use_operand_p use_p; 6669 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) 6670 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 6671 && !is_gimple_debug (use_stmt)) 6672 { 6673 if (gimple_code (use_stmt) == GIMPLE_PHI 6674 && gimple_phi_num_args (use_stmt) == 1) 6675 { 6676 replace_uses_by (gimple_phi_result (use_stmt), new_tree); 6677 } 6678 else 6679 { 6680 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 6681 SET_USE (use_p, new_tree); 6682 } 6683 update_stmt (use_stmt); 6684 } 6685 6686 return true; 6687 } 6688 6689 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */ 6690 6691 static void 6692 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt) 6693 { 6694 ssa_op_iter op_iter; 6695 imm_use_iterator imm_iter; 6696 def_operand_p def_p; 6697 gimple *ustmt; 6698 6699 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF) 6700 { 6701 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) 6702 { 6703 basic_block bb; 6704 6705 if (!is_gimple_debug (ustmt)) 6706 continue; 6707 6708 bb = gimple_bb (ustmt); 6709 6710 if (!flow_bb_inside_loop_p (loop, bb)) 6711 { 6712 if (gimple_debug_bind_p (ustmt)) 6713 { 6714 if (dump_enabled_p ()) 6715 dump_printf_loc (MSG_NOTE, vect_location, 6716 "killing debug use\n"); 6717 6718 gimple_debug_bind_reset_value (ustmt); 6719 update_stmt (ustmt); 6720 } 6721 else 6722 gcc_unreachable (); 6723 } 6724 } 6725 } 6726 } 6727 6728 /* Given loop represented by LOOP_VINFO, return true if computation of 6729 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false 6730 otherwise. */ 6731 6732 static bool 6733 loop_niters_no_overflow (loop_vec_info loop_vinfo) 6734 { 6735 /* Constant case. */ 6736 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 6737 { 6738 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo); 6739 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); 6740 6741 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST); 6742 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST); 6743 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) 6744 return true; 6745 } 6746 6747 widest_int max; 6748 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6749 /* Check the upper bound of loop niters. */ 6750 if (get_max_loop_iterations (loop, &max)) 6751 { 6752 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 6753 signop sgn = TYPE_SIGN (type); 6754 widest_int type_max = widest_int::from (wi::max_value (type), sgn); 6755 if (max < type_max) 6756 return true; 6757 } 6758 return false; 6759 } 6760 6761 /* Scale profiling counters by estimation for LOOP which is vectorized 6762 by factor VF. 
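   Rough example: with VF = 4 and an estimated 100 scalar iterations the
   vector loop is expected to iterate about 25 times, so the body
   frequencies are scaled by freq_e * (new_est_niter + 1) / freq_h and
   the exit probability becomes REG_BR_PROB_BASE / (new_est_niter + 1),
   as computed below.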
*/ 6763 6764 static void 6765 scale_profile_for_vect_loop (struct loop *loop, unsigned vf) 6766 { 6767 edge preheader = loop_preheader_edge (loop); 6768 /* Reduce loop iterations by the vectorization factor. */ 6769 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); 6770 gcov_type freq_h = loop->header->count, freq_e = preheader->count; 6771 6772 /* Use frequency only if counts are zero. */ 6773 if (freq_h == 0 && freq_e == 0) 6774 { 6775 freq_h = loop->header->frequency; 6776 freq_e = EDGE_FREQUENCY (preheader); 6777 } 6778 if (freq_h != 0) 6779 { 6780 gcov_type scale; 6781 6782 /* Avoid dropping loop body profile counter to 0 because of zero count 6783 in loop's preheader. */ 6784 freq_e = MAX (freq_e, 1); 6785 /* This should not overflow. */ 6786 scale = GCOV_COMPUTE_SCALE (freq_e * (new_est_niter + 1), freq_h); 6787 scale_loop_frequencies (loop, scale, REG_BR_PROB_BASE); 6788 } 6789 6790 basic_block exit_bb = single_pred (loop->latch); 6791 edge exit_e = single_exit (loop); 6792 exit_e->count = loop_preheader_edge (loop)->count; 6793 exit_e->probability = REG_BR_PROB_BASE / (new_est_niter + 1); 6794 6795 edge exit_l = single_pred_edge (loop->latch); 6796 int prob = exit_l->probability; 6797 exit_l->probability = REG_BR_PROB_BASE - exit_e->probability; 6798 exit_l->count = exit_bb->count - exit_e->count; 6799 if (exit_l->count < 0) 6800 exit_l->count = 0; 6801 if (prob > 0) 6802 scale_bbs_frequencies_int (&loop->latch, 1, exit_l->probability, prob); 6803 } 6804 6805 /* Function vect_transform_loop. 6806 6807 The analysis phase has determined that the loop is vectorizable. 6808 Vectorize the loop - created vectorized stmts to replace the scalar 6809 stmts in the loop, and update the loop exit condition. 6810 Returns scalar epilogue loop if any. */ 6811 6812 struct loop * 6813 vect_transform_loop (loop_vec_info loop_vinfo) 6814 { 6815 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6816 struct loop *epilogue = NULL; 6817 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 6818 int nbbs = loop->num_nodes; 6819 int i; 6820 tree niters_vector = NULL; 6821 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 6822 bool grouped_store; 6823 bool slp_scheduled = false; 6824 gimple *stmt, *pattern_stmt; 6825 gimple_seq pattern_def_seq = NULL; 6826 gimple_stmt_iterator pattern_def_si = gsi_none (); 6827 bool transform_pattern_stmt = false; 6828 bool check_profitability = false; 6829 int th; 6830 6831 if (dump_enabled_p ()) 6832 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n"); 6833 6834 /* Use the more conservative vectorization threshold. If the number 6835 of iterations is constant assume the cost check has been performed 6836 by our caller. If the threshold makes all loops profitable that 6837 run at least the vectorization factor number of times checking 6838 is pointless, too. */ 6839 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 6840 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 6841 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 6842 { 6843 if (dump_enabled_p ()) 6844 dump_printf_loc (MSG_NOTE, vect_location, 6845 "Profitability threshold is %d loop iterations.\n", 6846 th); 6847 check_profitability = true; 6848 } 6849 6850 /* Make sure there exists a single-predecessor exit bb. Do this before 6851 versioning. */ 6852 edge e = single_exit (loop); 6853 if (! 
single_pred_p (e->dest)) 6854 { 6855 split_loop_exit_edge (e); 6856 if (dump_enabled_p ()) 6857 dump_printf (MSG_NOTE, "split exit edge\n"); 6858 } 6859 6860 /* Version the loop first, if required, so the profitability check 6861 comes first. */ 6862 6863 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 6864 { 6865 vect_loop_versioning (loop_vinfo, th, check_profitability); 6866 check_profitability = false; 6867 } 6868 6869 /* Make sure there exists a single-predecessor exit bb also on the 6870 scalar loop copy. Do this after versioning but before peeling 6871 so CFG structure is fine for both scalar and if-converted loop 6872 to make slpeel_duplicate_current_defs_from_edges face matched 6873 loop closed PHI nodes on the exit. */ 6874 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 6875 { 6876 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); 6877 if (! single_pred_p (e->dest)) 6878 { 6879 split_loop_exit_edge (e); 6880 if (dump_enabled_p ()) 6881 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); 6882 } 6883 } 6884 6885 tree niters = vect_build_loop_niters (loop_vinfo); 6886 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; 6887 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); 6888 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); 6889 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th, 6890 check_profitability, niters_no_overflow); 6891 if (niters_vector == NULL_TREE) 6892 { 6893 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 6894 niters_vector 6895 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), 6896 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf); 6897 else 6898 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, 6899 niters_no_overflow); 6900 } 6901 6902 /* 1) Make sure the loop header has exactly two entries 6903 2) Make sure we have a preheader basic block. */ 6904 6905 gcc_assert (EDGE_COUNT (loop->header->preds) == 2); 6906 6907 split_edge (loop_preheader_edge (loop)); 6908 6909 /* FORNOW: the vectorizer supports only loops whose body consists 6910 of one basic block (header + empty latch). Once the vectorizer 6911 supports more involved loop forms, the order in which the BBs are 6912 traversed will need to be reconsidered.
*/ 6913 6914 for (i = 0; i < nbbs; i++) 6915 { 6916 basic_block bb = bbs[i]; 6917 stmt_vec_info stmt_info; 6918 6919 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 6920 gsi_next (&si)) 6921 { 6922 gphi *phi = si.phi (); 6923 if (dump_enabled_p ()) 6924 { 6925 dump_printf_loc (MSG_NOTE, vect_location, 6926 "------>vectorizing phi: "); 6927 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 6928 } 6929 stmt_info = vinfo_for_stmt (phi); 6930 if (!stmt_info) 6931 continue; 6932 6933 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 6934 vect_loop_kill_debug_uses (loop, phi); 6935 6936 if (!STMT_VINFO_RELEVANT_P (stmt_info) 6937 && !STMT_VINFO_LIVE_P (stmt_info)) 6938 continue; 6939 6940 if (STMT_VINFO_VECTYPE (stmt_info) 6941 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)) 6942 != (unsigned HOST_WIDE_INT) vf) 6943 && dump_enabled_p ()) 6944 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 6945 6946 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) 6947 { 6948 if (dump_enabled_p ()) 6949 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n"); 6950 vect_transform_stmt (phi, NULL, NULL, NULL, NULL); 6951 } 6952 } 6953 6954 pattern_stmt = NULL; 6955 for (gimple_stmt_iterator si = gsi_start_bb (bb); 6956 !gsi_end_p (si) || transform_pattern_stmt;) 6957 { 6958 bool is_store; 6959 6960 if (transform_pattern_stmt) 6961 stmt = pattern_stmt; 6962 else 6963 { 6964 stmt = gsi_stmt (si); 6965 /* During vectorization remove existing clobber stmts. */ 6966 if (gimple_clobber_p (stmt)) 6967 { 6968 unlink_stmt_vdef (stmt); 6969 gsi_remove (&si, true); 6970 release_defs (stmt); 6971 continue; 6972 } 6973 } 6974 6975 if (dump_enabled_p ()) 6976 { 6977 dump_printf_loc (MSG_NOTE, vect_location, 6978 "------>vectorizing statement: "); 6979 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 6980 } 6981 6982 stmt_info = vinfo_for_stmt (stmt); 6983 6984 /* vector stmts created in the outer-loop during vectorization of 6985 stmts in an inner-loop may not have a stmt_info, and do not 6986 need to be vectorized. */ 6987 if (!stmt_info) 6988 { 6989 gsi_next (&si); 6990 continue; 6991 } 6992 6993 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 6994 vect_loop_kill_debug_uses (loop, stmt); 6995 6996 if (!STMT_VINFO_RELEVANT_P (stmt_info) 6997 && !STMT_VINFO_LIVE_P (stmt_info)) 6998 { 6999 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 7000 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 7001 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 7002 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 7003 { 7004 stmt = pattern_stmt; 7005 stmt_info = vinfo_for_stmt (stmt); 7006 } 7007 else 7008 { 7009 gsi_next (&si); 7010 continue; 7011 } 7012 } 7013 else if (STMT_VINFO_IN_PATTERN_P (stmt_info) 7014 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 7015 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 7016 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 7017 transform_pattern_stmt = true; 7018 7019 /* If pattern statement has def stmts, vectorize them too. 
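   For illustration (a hypothetical pattern, not necessarily one emitted
   by the pattern recognizer): a mixed-precision pattern may record
   helper statements in STMT_VINFO_PATTERN_DEF_SEQ, e.g. an intermediate
   conversion

       patt_t = (unsigned short) x_1;      pattern def stmt
       patt_r = patt_t * patt_t;           main pattern stmt

   and such helper statements are vectorized here, in order, before the
   main pattern statement itself.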
          if (is_pattern_stmt_p (stmt_info))
            {
              if (pattern_def_seq == NULL)
                {
                  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                  pattern_def_si = gsi_start (pattern_def_seq);
                }
              else if (!gsi_end_p (pattern_def_si))
                gsi_next (&pattern_def_si);
              if (pattern_def_seq != NULL)
                {
                  gimple *pattern_def_stmt = NULL;
                  stmt_vec_info pattern_def_stmt_info = NULL;

                  while (!gsi_end_p (pattern_def_si))
                    {
                      pattern_def_stmt = gsi_stmt (pattern_def_si);
                      pattern_def_stmt_info
                        = vinfo_for_stmt (pattern_def_stmt);
                      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
                          || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
                        break;
                      gsi_next (&pattern_def_si);
                    }

                  if (!gsi_end_p (pattern_def_si))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> vectorizing pattern def "
                                           "stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
                        }

                      stmt = pattern_def_stmt;
                      stmt_info = pattern_def_stmt_info;
                    }
                  else
                    {
                      pattern_def_si = gsi_none ();
                      transform_pattern_stmt = false;
                    }
                }
              else
                transform_pattern_stmt = false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              unsigned int nunits
                = (unsigned int)
                  TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
              if (!STMT_SLP_TYPE (stmt_info)
                  && nunits != (unsigned int) vf
                  && dump_enabled_p ())
                /* For SLP, VF is set according to the unrolling factor and
                   not to the vector size, hence this diagnostic is not
                   valid for SLP.  */
                dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
            }

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "=== scheduling SLP instances ===\n");

                  vect_schedule_slp (loop_vinfo);
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
                {
                  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
                    {
                      pattern_def_seq = NULL;
                      gsi_next (&si);
                    }
                  continue;
                }
            }

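          /* Illustrative note (editorial): pure SLP stmts were already
             handled by vect_schedule_slp above and are skipped here, while
             hybrid SLP stmts (used both inside and outside an SLP instance)
             fall through and are additionally vectorized below as ordinary
             loop-vectorized statements.  */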
          /* -------- vectorize statement ------------ */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");

          grouped_store = false;
          is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
          if (is_store)
            {
              if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  gsi_next (&si);
                  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  gimple *store = gsi_stmt (si);
                  free_stmt_vec_info (store);
                  unlink_stmt_vdef (store);
                  gsi_remove (&si, true);
                  release_defs (store);
                }

              /* Stores can only appear at the end of pattern statements.  */
              gcc_assert (!transform_pattern_stmt);
              pattern_def_seq = NULL;
            }
          else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
            }
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, niters_vector);

  scale_profile_for_vect_loop (loop, vf);

  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
     by the vector code.  */
  int bias = 1 - min_epilogue_iters;
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    loop->nb_iterations_upper_bound
      = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
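
  /* Illustrative example (editorial): with a recorded latch-count upper
     bound of 99 (at most 100 iterations), VF 4 and no peeling for gaps,
     BIAS is 1 and the new bound is (99 + 1) / 4 - 1 = 24, i.e. at most 25
     vector iterations.  With peeling for gaps, BIAS is 0 and the bound
     becomes 99 / 4 - 1 = 23, since the final scalar iteration is always
     left to the epilogue.  */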

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP VECTORIZED\n");
          if (loop->inner)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "OUTER LOOP VECTORIZED\n");
          dump_printf (MSG_NOTE, "\n");
        }
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
                         current_vector_size);
    }

  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is no longer valid after
     vectorization: the vectorized loop can have loop-carried
     dependences.  */
  loop->safelen = 0;

  /* Don't vectorize epilogue for epilogue.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    epilogue = NULL;

  if (epilogue)
    {
      unsigned int vector_sizes
        = targetm.vectorize.autovectorize_vector_sizes ();
      vector_sizes &= current_vector_size - 1;

      if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
        epilogue = NULL;
      else if (!vector_sizes)
        epilogue = NULL;
      else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
               && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
        {
          int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
          int ratio = current_vector_size / smallest_vec_size;
          int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
            - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
          eiters = eiters % vf;

          epilogue->nb_iterations_upper_bound = eiters - 1;

          if (eiters < vf / ratio)
            epilogue = NULL;
        }
    }
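
  /* Illustrative example (editorial): on a target advertising 32- and
     16-byte vector sizes with current_vector_size == 32, the mask above
     keeps only the 16-byte size, so RATIO is 2.  With, say, 20 known
     iterations, no alignment peeling and VF 8, EITERS is 20 % 8 = 4,
     which is not below vf / ratio = 4, so the epilogue loop is still
     considered for vectorization with the smaller vector size.  */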

  if (epilogue)
    {
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->safelen = loop->safelen;
      epilogue->dont_vectorize = false;

      /* We may need to if-convert the epilogue to vectorize it.  */
      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
        tree_if_conversion (epilogue);
    }

  return epilogue;
}

/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is all zeros, the store
   is not performed, and, if possible, neither are the producers of the
   stored values.  For example,

     for (i=0; i<n; i++)
       if (c[i])
         {
           p1[i] += 1;
           p2[i] = p3[i] + 2;
         }

   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/

void
optimize_mask_stores (struct loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  struct loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in the loop, if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create STORE_BB and the if-then structure in the CFG.  STORE_BB
         belongs to the same loop as BB; this can be different from LOOP
         when a two-level loop nest is vectorized and the mask_store
         belongs to the inner loop.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = PROB_UNLIKELY;
      store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM.3 = VDEF <.MEM_1>
         and new PHI node will be created in join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>  */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Set GSI_TO to the start of the now non-empty block.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Move stmt to created bb\n");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
            }
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 a volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* LHS of vectorized stmt must be SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;

              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                }

              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;

              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Move stmt to created bb\n");
                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
                }
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}