/* Loop Vectorization
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "params.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.
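   For instance, for a loop "for (i = 0; i < N; i++)" scev describes the
   evolution of the induction variable i as the chrec {0, +, 1}_1
   (initial value 0, step 1, in loop number 1); the analyses below are
   phrased in terms of such evolution functions.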

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different vector sizes will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a 16-byte vector size (VS), the VF is set to 4,
   since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.
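   (In terms of vector modes: 4-byte ints in a 16-byte vector use V4SImode,
   giving VF = 4, while 2-byte shorts would use V8HImode, giving VF = 8.)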

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;
  HOST_WIDE_INT dummy;
  gimple *stmt, *pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;
  bool bool_result;
  auto_vec<stmt_vec_info> mask_producers;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===\n");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          phi = si.phi ();
          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info)
              || STMT_VINFO_LIVE_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
                  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
                  dump_printf (MSG_NOTE, "\n");
                }

              vect_update_max_nunits (&vectorization_factor, vectype);
            }
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si) || analyze_pattern_stmt;)
        {
          tree vf_vectype;

          if (analyze_pattern_stmt)
            stmt = pattern_stmt;
          else
            stmt = gsi_stmt (si);

          stmt_info = vinfo_for_stmt (stmt);

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.
*/ 294 if ((!STMT_VINFO_RELEVANT_P (stmt_info) 295 && !STMT_VINFO_LIVE_P (stmt_info)) 296 || gimple_clobber_p (stmt)) 297 { 298 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 302 { 303 stmt = pattern_stmt; 304 stmt_info = vinfo_for_stmt (pattern_stmt); 305 if (dump_enabled_p ()) 306 { 307 dump_printf_loc (MSG_NOTE, vect_location, 308 "==> examining pattern statement: "); 309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 310 } 311 } 312 else 313 { 314 if (dump_enabled_p ()) 315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n"); 316 gsi_next (&si); 317 continue; 318 } 319 } 320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info) 321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 324 analyze_pattern_stmt = true; 325 326 /* If a pattern statement has def stmts, analyze them too. */ 327 if (is_pattern_stmt_p (stmt_info)) 328 { 329 if (pattern_def_seq == NULL) 330 { 331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 332 pattern_def_si = gsi_start (pattern_def_seq); 333 } 334 else if (!gsi_end_p (pattern_def_si)) 335 gsi_next (&pattern_def_si); 336 if (pattern_def_seq != NULL) 337 { 338 gimple *pattern_def_stmt = NULL; 339 stmt_vec_info pattern_def_stmt_info = NULL; 340 341 while (!gsi_end_p (pattern_def_si)) 342 { 343 pattern_def_stmt = gsi_stmt (pattern_def_si); 344 pattern_def_stmt_info 345 = vinfo_for_stmt (pattern_def_stmt); 346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) 347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) 348 break; 349 gsi_next (&pattern_def_si); 350 } 351 352 if (!gsi_end_p (pattern_def_si)) 353 { 354 if (dump_enabled_p ()) 355 { 356 dump_printf_loc (MSG_NOTE, vect_location, 357 "==> examining pattern def stmt: "); 358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 359 pattern_def_stmt, 0); 360 } 361 362 stmt = pattern_def_stmt; 363 stmt_info = pattern_def_stmt_info; 364 } 365 else 366 { 367 pattern_def_si = gsi_none (); 368 analyze_pattern_stmt = false; 369 } 370 } 371 else 372 analyze_pattern_stmt = false; 373 } 374 375 if (gimple_get_lhs (stmt) == NULL_TREE 376 /* MASK_STORE has no lhs, but is ok. */ 377 && (!is_gimple_call (stmt) 378 || !gimple_call_internal_p (stmt) 379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE)) 380 { 381 if (is_gimple_call (stmt)) 382 { 383 /* Ignore calls with no lhs. These must be calls to 384 #pragma omp simd functions, and what vectorization factor 385 it really needs can't be determined until 386 vectorizable_simd_clone_call. 
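             For instance, a call such as "foo (a[i]);" where foo (an
             arbitrary example name) is declared with
             "#pragma omp declare simd" and is called only for its side
             effects has no lhs and is deferred to that analysis.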
*/ 387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si)) 388 { 389 pattern_def_seq = NULL; 390 gsi_next (&si); 391 } 392 continue; 393 } 394 if (dump_enabled_p ()) 395 { 396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 397 "not vectorized: irregular stmt."); 398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 399 0); 400 } 401 return false; 402 } 403 404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt)))) 405 { 406 if (dump_enabled_p ()) 407 { 408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 409 "not vectorized: vector stmt in loop:"); 410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 411 } 412 return false; 413 } 414 415 bool_result = false; 416 417 if (STMT_VINFO_VECTYPE (stmt_info)) 418 { 419 /* The only case when a vectype had been already set is for stmts 420 that contain a dataref, or for "pattern-stmts" (stmts 421 generated by the vectorizer to represent/replace a certain 422 idiom). */ 423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info) 424 || is_pattern_stmt_p (stmt_info) 425 || !gsi_end_p (pattern_def_si)); 426 vectype = STMT_VINFO_VECTYPE (stmt_info); 427 } 428 else 429 { 430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)); 431 if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) 432 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3)); 433 else 434 scalar_type = TREE_TYPE (gimple_get_lhs (stmt)); 435 436 /* Bool ops don't participate in vectorization factor 437 computation. For comparison use compared types to 438 compute a factor. */ 439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type) 440 && is_gimple_assign (stmt) 441 && gimple_assign_rhs_code (stmt) != COND_EXPR) 442 { 443 if (STMT_VINFO_RELEVANT_P (stmt_info) 444 || STMT_VINFO_LIVE_P (stmt_info)) 445 mask_producers.safe_push (stmt_info); 446 bool_result = true; 447 448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) 449 == tcc_comparison 450 && !VECT_SCALAR_BOOLEAN_TYPE_P 451 (TREE_TYPE (gimple_assign_rhs1 (stmt)))) 452 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 453 else 454 { 455 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si)) 456 { 457 pattern_def_seq = NULL; 458 gsi_next (&si); 459 } 460 continue; 461 } 462 } 463 464 if (dump_enabled_p ()) 465 { 466 dump_printf_loc (MSG_NOTE, vect_location, 467 "get vectype for scalar type: "); 468 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type); 469 dump_printf (MSG_NOTE, "\n"); 470 } 471 vectype = get_vectype_for_scalar_type (scalar_type); 472 if (!vectype) 473 { 474 if (dump_enabled_p ()) 475 { 476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 477 "not vectorized: unsupported " 478 "data-type "); 479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 480 scalar_type); 481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 482 } 483 return false; 484 } 485 486 if (!bool_result) 487 STMT_VINFO_VECTYPE (stmt_info) = vectype; 488 489 if (dump_enabled_p ()) 490 { 491 dump_printf_loc (MSG_NOTE, vect_location, "vectype: "); 492 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype); 493 dump_printf (MSG_NOTE, "\n"); 494 } 495 } 496 497 /* Don't try to compute VF out scalar types if we stmt 498 produces boolean vector. Use result vectype instead. */ 499 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 500 vf_vectype = vectype; 501 else 502 { 503 /* The vectorization factor is according to the smallest 504 scalar type (or the largest vector size, but we only 505 support one vector size per loop). 
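             For instance, a widening statement such as "int_x = (int) char_y"
             has a 4-byte result but a 1-byte input; the 1-byte type is used
             here, so with 16-byte vectors this statement contributes a factor
             of 16 rather than 4.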
*/ 506 if (!bool_result) 507 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, 508 &dummy); 509 if (dump_enabled_p ()) 510 { 511 dump_printf_loc (MSG_NOTE, vect_location, 512 "get vectype for scalar type: "); 513 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type); 514 dump_printf (MSG_NOTE, "\n"); 515 } 516 vf_vectype = get_vectype_for_scalar_type (scalar_type); 517 } 518 if (!vf_vectype) 519 { 520 if (dump_enabled_p ()) 521 { 522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 523 "not vectorized: unsupported data-type "); 524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 525 scalar_type); 526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 527 } 528 return false; 529 } 530 531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)), 532 GET_MODE_SIZE (TYPE_MODE (vf_vectype)))) 533 { 534 if (dump_enabled_p ()) 535 { 536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 537 "not vectorized: different sized vector " 538 "types in statement, "); 539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 540 vectype); 541 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 543 vf_vectype); 544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 545 } 546 return false; 547 } 548 549 if (dump_enabled_p ()) 550 { 551 dump_printf_loc (MSG_NOTE, vect_location, "vectype: "); 552 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype); 553 dump_printf (MSG_NOTE, "\n"); 554 } 555 556 if (dump_enabled_p ()) 557 { 558 dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); 559 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype)); 560 dump_printf (MSG_NOTE, "\n"); 561 } 562 563 vect_update_max_nunits (&vectorization_factor, vf_vectype); 564 565 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si)) 566 { 567 pattern_def_seq = NULL; 568 gsi_next (&si); 569 } 570 } 571 } 572 573 /* TODO: Analyze cost. Decide if worth while to vectorize. 
*/ 574 if (dump_enabled_p ()) 575 { 576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = "); 577 dump_dec (MSG_NOTE, vectorization_factor); 578 dump_printf (MSG_NOTE, "\n"); 579 } 580 581 if (known_le (vectorization_factor, 1U)) 582 { 583 if (dump_enabled_p ()) 584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 585 "not vectorized: unsupported data-type\n"); 586 return false; 587 } 588 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 589 590 for (i = 0; i < mask_producers.length (); i++) 591 { 592 tree mask_type = NULL; 593 594 stmt = STMT_VINFO_STMT (mask_producers[i]); 595 596 if (is_gimple_assign (stmt) 597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison 598 && !VECT_SCALAR_BOOLEAN_TYPE_P 599 (TREE_TYPE (gimple_assign_rhs1 (stmt)))) 600 { 601 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 602 mask_type = get_mask_type_for_scalar_type (scalar_type); 603 604 if (!mask_type) 605 { 606 if (dump_enabled_p ()) 607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 608 "not vectorized: unsupported mask\n"); 609 return false; 610 } 611 } 612 else 613 { 614 tree rhs; 615 ssa_op_iter iter; 616 gimple *def_stmt; 617 enum vect_def_type dt; 618 619 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE) 620 { 621 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo, 622 &def_stmt, &dt, &vectype)) 623 { 624 if (dump_enabled_p ()) 625 { 626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 627 "not vectorized: can't compute mask type " 628 "for statement, "); 629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 630 0); 631 } 632 return false; 633 } 634 635 /* No vectype probably means external definition. 636 Allow it in case there is another operand which 637 allows to determine mask type. */ 638 if (!vectype) 639 continue; 640 641 if (!mask_type) 642 mask_type = vectype; 643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), 644 TYPE_VECTOR_SUBPARTS (vectype))) 645 { 646 if (dump_enabled_p ()) 647 { 648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 649 "not vectorized: different sized masks " 650 "types in statement, "); 651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 652 mask_type); 653 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 654 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 655 vectype); 656 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 657 } 658 return false; 659 } 660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type) 661 != VECTOR_BOOLEAN_TYPE_P (vectype)) 662 { 663 if (dump_enabled_p ()) 664 { 665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 666 "not vectorized: mixed mask and " 667 "nonmask vector types in statement, "); 668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 669 mask_type); 670 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 671 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 672 vectype); 673 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 674 } 675 return false; 676 } 677 } 678 679 /* We may compare boolean value loaded as vector of integers. 680 Fix mask_type in such case. */ 681 if (mask_type 682 && !VECTOR_BOOLEAN_TYPE_P (mask_type) 683 && gimple_code (stmt) == GIMPLE_ASSIGN 684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison) 685 mask_type = build_same_sized_truth_vector_type (mask_type); 686 } 687 688 /* No mask_type should mean loop invariant predicate. 689 This is probably a subject for optimization in 690 if-conversion. 
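         This can happen, for example, when a mask-producing statement only
         combines booleans that are all defined before the loop, so every
         operand is an external definition with no vectype.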
*/ 691 if (!mask_type) 692 { 693 if (dump_enabled_p ()) 694 { 695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 696 "not vectorized: can't compute mask type " 697 "for statement, "); 698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 699 0); 700 } 701 return false; 702 } 703 704 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type; 705 } 706 707 return true; 708 } 709 710 711 /* Function vect_is_simple_iv_evolution. 712 713 FORNOW: A simple evolution of an induction variables in the loop is 714 considered a polynomial evolution. */ 715 716 static bool 717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, 718 tree * step) 719 { 720 tree init_expr; 721 tree step_expr; 722 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb); 723 basic_block bb; 724 725 /* When there is no evolution in this loop, the evolution function 726 is not "simple". */ 727 if (evolution_part == NULL_TREE) 728 return false; 729 730 /* When the evolution is a polynomial of degree >= 2 731 the evolution function is not "simple". */ 732 if (tree_is_chrec (evolution_part)) 733 return false; 734 735 step_expr = evolution_part; 736 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb)); 737 738 if (dump_enabled_p ()) 739 { 740 dump_printf_loc (MSG_NOTE, vect_location, "step: "); 741 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr); 742 dump_printf (MSG_NOTE, ", init: "); 743 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr); 744 dump_printf (MSG_NOTE, "\n"); 745 } 746 747 *init = init_expr; 748 *step = step_expr; 749 750 if (TREE_CODE (step_expr) != INTEGER_CST 751 && (TREE_CODE (step_expr) != SSA_NAME 752 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr))) 753 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb)) 754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) 755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)) 756 || !flag_associative_math))) 757 && (TREE_CODE (step_expr) != REAL_CST 758 || !flag_associative_math)) 759 { 760 if (dump_enabled_p ()) 761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 762 "step unknown.\n"); 763 return false; 764 } 765 766 return true; 767 } 768 769 /* Function vect_analyze_scalar_cycles_1. 770 771 Examine the cross iteration def-use cycles of scalar variables 772 in LOOP. LOOP_VINFO represents the loop that is now being 773 considered for vectorization (can be LOOP, or an outer-loop 774 enclosing LOOP). */ 775 776 static void 777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) 778 { 779 basic_block bb = loop->header; 780 tree init, step; 781 auto_vec<gimple *, 64> worklist; 782 gphi_iterator gsi; 783 bool double_reduc; 784 785 if (dump_enabled_p ()) 786 dump_printf_loc (MSG_NOTE, vect_location, 787 "=== vect_analyze_scalar_cycles ===\n"); 788 789 /* First - identify all inductions. Reduction detection assumes that all the 790 inductions have been identified, therefore, this order must not be 791 changed. */ 792 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 793 { 794 gphi *phi = gsi.phi (); 795 tree access_fn = NULL; 796 tree def = PHI_RESULT (phi); 797 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi); 798 799 if (dump_enabled_p ()) 800 { 801 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: "); 802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 803 } 804 805 /* Skip virtual phi's. The data dependences that are associated with 806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. 
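         These are the .MEM = PHI <...> nodes of the virtual operand web;
         they carry no scalar value of their own.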
*/ 807 if (virtual_operand_p (def)) 808 continue; 809 810 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; 811 812 /* Analyze the evolution function. */ 813 access_fn = analyze_scalar_evolution (loop, def); 814 if (access_fn) 815 { 816 STRIP_NOPS (access_fn); 817 if (dump_enabled_p ()) 818 { 819 dump_printf_loc (MSG_NOTE, vect_location, 820 "Access function of PHI: "); 821 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn); 822 dump_printf (MSG_NOTE, "\n"); 823 } 824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) 825 = initial_condition_in_loop_num (access_fn, loop->num); 826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) 827 = evolution_part_in_loop_num (access_fn, loop->num); 828 } 829 830 if (!access_fn 831 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step) 832 || (LOOP_VINFO_LOOP (loop_vinfo) != loop 833 && TREE_CODE (step) != INTEGER_CST)) 834 { 835 worklist.safe_push (phi); 836 continue; 837 } 838 839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) 840 != NULL_TREE); 841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE); 842 843 if (dump_enabled_p ()) 844 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n"); 845 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def; 846 } 847 848 849 /* Second - identify all reductions and nested cycles. */ 850 while (worklist.length () > 0) 851 { 852 gimple *phi = worklist.pop (); 853 tree def = PHI_RESULT (phi); 854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi); 855 gimple *reduc_stmt; 856 857 if (dump_enabled_p ()) 858 { 859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: "); 860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 861 } 862 863 gcc_assert (!virtual_operand_p (def) 864 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); 865 866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, 867 &double_reduc, false); 868 if (reduc_stmt) 869 { 870 if (double_reduc) 871 { 872 if (dump_enabled_p ()) 873 dump_printf_loc (MSG_NOTE, vect_location, 874 "Detected double reduction.\n"); 875 876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; 877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = 878 vect_double_reduction_def; 879 } 880 else 881 { 882 if (loop != LOOP_VINFO_LOOP (loop_vinfo)) 883 { 884 if (dump_enabled_p ()) 885 dump_printf_loc (MSG_NOTE, vect_location, 886 "Detected vectorizable nested cycle.\n"); 887 888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; 889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = 890 vect_nested_cycle; 891 } 892 else 893 { 894 if (dump_enabled_p ()) 895 dump_printf_loc (MSG_NOTE, vect_location, 896 "Detected reduction.\n"); 897 898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; 899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = 900 vect_reduction_def; 901 /* Store the reduction cycles for possible vectorization in 902 loop-aware SLP if it was not detected as reduction 903 chain. */ 904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt))) 905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt); 906 } 907 } 908 } 909 else 910 if (dump_enabled_p ()) 911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 912 "Unknown def-use cycle pattern.\n"); 913 } 914 } 915 916 917 /* Function vect_analyze_scalar_cycles. 918 919 Examine the cross iteration def-use cycles of scalar variables, by 920 analyzing the loop-header PHIs of scalar variables. Classify each 921 cycle as one of the following: invariant, induction, reduction, unknown. 
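   (A cycle that fits none of the supported forms, e.g. a recurrence such as
   x = x * x, ends up classified as unknown.)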
922 We do that for the loop represented by LOOP_VINFO, and also to its 923 inner-loop, if exists. 924 Examples for scalar cycles: 925 926 Example1: reduction: 927 928 loop1: 929 for (i=0; i<N; i++) 930 sum += a[i]; 931 932 Example2: induction: 933 934 loop2: 935 for (i=0; i<N; i++) 936 a[i] = i; */ 937 938 static void 939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) 940 { 941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 942 943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop); 944 945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially. 946 Reductions in such inner-loop therefore have different properties than 947 the reductions in the nest that gets vectorized: 948 1. When vectorized, they are executed in the same order as in the original 949 scalar loop, so we can't change the order of computation when 950 vectorizing them. 951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the 952 current checks are too strict. */ 953 954 if (loop->inner) 955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner); 956 } 957 958 /* Transfer group and reduction information from STMT to its pattern stmt. */ 959 960 static void 961 vect_fixup_reduc_chain (gimple *stmt) 962 { 963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)); 964 gimple *stmtp; 965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp)) 966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))); 967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt)); 968 do 969 { 970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)); 971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp; 972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 973 if (stmt) 974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp)) 975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)); 976 } 977 while (stmt); 978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def; 979 } 980 981 /* Fixup scalar cycles that now have their stmts detected as patterns. */ 982 983 static void 984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) 985 { 986 gimple *first; 987 unsigned i; 988 989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first) 990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first))) 991 { 992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)); 993 while (next) 994 { 995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))) 996 break; 997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 998 } 999 /* If not all stmt in the chain are patterns try to handle 1000 the chain without patterns. */ 1001 if (! next) 1002 { 1003 vect_fixup_reduc_chain (first); 1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] 1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first)); 1006 } 1007 } 1008 } 1009 1010 /* Function vect_get_loop_niters. 1011 1012 Determine how many iterations the loop is executed and place it 1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations 1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the 1015 niter information holds in ASSUMPTIONS. 1016 1017 Return the loop exit condition. 
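   For instance, for a loop entered with i == 0 that exits when i reaches n
   (with n >= 1), NUMBER_OF_ITERATIONSM1 is n - 1 (latch executions) and
   NUMBER_OF_ITERATIONS is n (header executions).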
*/ 1018 1019 1020 static gcond * 1021 vect_get_loop_niters (struct loop *loop, tree *assumptions, 1022 tree *number_of_iterations, tree *number_of_iterationsm1) 1023 { 1024 edge exit = single_exit (loop); 1025 struct tree_niter_desc niter_desc; 1026 tree niter_assumptions, niter, may_be_zero; 1027 gcond *cond = get_loop_exit_condition (loop); 1028 1029 *assumptions = boolean_true_node; 1030 *number_of_iterationsm1 = chrec_dont_know; 1031 *number_of_iterations = chrec_dont_know; 1032 if (dump_enabled_p ()) 1033 dump_printf_loc (MSG_NOTE, vect_location, 1034 "=== get_loop_niters ===\n"); 1035 1036 if (!exit) 1037 return cond; 1038 1039 niter = chrec_dont_know; 1040 may_be_zero = NULL_TREE; 1041 niter_assumptions = boolean_true_node; 1042 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) 1043 || chrec_contains_undetermined (niter_desc.niter)) 1044 return cond; 1045 1046 niter_assumptions = niter_desc.assumptions; 1047 may_be_zero = niter_desc.may_be_zero; 1048 niter = niter_desc.niter; 1049 1050 if (may_be_zero && integer_zerop (may_be_zero)) 1051 may_be_zero = NULL_TREE; 1052 1053 if (may_be_zero) 1054 { 1055 if (COMPARISON_CLASS_P (may_be_zero)) 1056 { 1057 /* Try to combine may_be_zero with assumptions, this can simplify 1058 computation of niter expression. */ 1059 if (niter_assumptions && !integer_nonzerop (niter_assumptions)) 1060 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, 1061 niter_assumptions, 1062 fold_build1 (TRUTH_NOT_EXPR, 1063 boolean_type_node, 1064 may_be_zero)); 1065 else 1066 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero, 1067 build_int_cst (TREE_TYPE (niter), 0), 1068 rewrite_to_non_trapping_overflow (niter)); 1069 1070 may_be_zero = NULL_TREE; 1071 } 1072 else if (integer_nonzerop (may_be_zero)) 1073 { 1074 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0); 1075 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1); 1076 return cond; 1077 } 1078 else 1079 return cond; 1080 } 1081 1082 *assumptions = niter_assumptions; 1083 *number_of_iterationsm1 = niter; 1084 1085 /* We want the number of loop header executions which is the number 1086 of latch executions plus one. 1087 ??? For UINT_MAX latch executions this number overflows to zero 1088 for loops like do { n++; } while (n != 0); */ 1089 if (niter && !chrec_contains_undetermined (niter)) 1090 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter), 1091 build_int_cst (TREE_TYPE (niter), 1)); 1092 *number_of_iterations = niter; 1093 1094 return cond; 1095 } 1096 1097 /* Function bb_in_loop_p 1098 1099 Used as predicate for dfs order traversal of the loop bbs. */ 1100 1101 static bool 1102 bb_in_loop_p (const_basic_block bb, const void *data) 1103 { 1104 const struct loop *const loop = (const struct loop *)data; 1105 if (flow_bb_inside_loop_p (loop, bb)) 1106 return true; 1107 return false; 1108 } 1109 1110 1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as 1112 stmt_vec_info structs for all the stmts in LOOP_IN. 
*/ 1113 1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in) 1115 : vec_info (vec_info::loop, init_cost (loop_in)), 1116 loop (loop_in), 1117 bbs (XCNEWVEC (basic_block, loop->num_nodes)), 1118 num_itersm1 (NULL_TREE), 1119 num_iters (NULL_TREE), 1120 num_iters_unchanged (NULL_TREE), 1121 num_iters_assumptions (NULL_TREE), 1122 th (0), 1123 versioning_threshold (0), 1124 vectorization_factor (0), 1125 max_vectorization_factor (0), 1126 mask_skip_niters (NULL_TREE), 1127 mask_compare_type (NULL_TREE), 1128 unaligned_dr (NULL), 1129 peeling_for_alignment (0), 1130 ptr_mask (0), 1131 ivexpr_map (NULL), 1132 slp_unrolling_factor (1), 1133 single_scalar_iteration_cost (0), 1134 vectorizable (false), 1135 can_fully_mask_p (true), 1136 fully_masked_p (false), 1137 peeling_for_gaps (false), 1138 peeling_for_niter (false), 1139 operands_swapped (false), 1140 no_data_dependencies (false), 1141 has_mask_store (false), 1142 scalar_loop (NULL), 1143 orig_loop_info (NULL) 1144 { 1145 /* Create/Update stmt_info for all stmts in the loop. */ 1146 basic_block *body = get_loop_body (loop); 1147 for (unsigned int i = 0; i < loop->num_nodes; i++) 1148 { 1149 basic_block bb = body[i]; 1150 gimple_stmt_iterator si; 1151 1152 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) 1153 { 1154 gimple *phi = gsi_stmt (si); 1155 gimple_set_uid (phi, 0); 1156 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this)); 1157 } 1158 1159 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1160 { 1161 gimple *stmt = gsi_stmt (si); 1162 gimple_set_uid (stmt, 0); 1163 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this)); 1164 } 1165 } 1166 free (body); 1167 1168 /* CHECKME: We want to visit all BBs before their successors (except for 1169 latch blocks, for which this assertion wouldn't hold). In the simple 1170 case of the loop forms we allow, a dfs order of the BBs would the same 1171 as reversed postorder traversal, so we are safe. */ 1172 1173 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, 1174 bbs, loop->num_nodes, loop); 1175 gcc_assert (nbbs == loop->num_nodes); 1176 } 1177 1178 /* Free all levels of MASKS. */ 1179 1180 void 1181 release_vec_loop_masks (vec_loop_masks *masks) 1182 { 1183 rgroup_masks *rgm; 1184 unsigned int i; 1185 FOR_EACH_VEC_ELT (*masks, i, rgm) 1186 rgm->masks.release (); 1187 masks->release (); 1188 } 1189 1190 /* Free all memory used by the _loop_vec_info, as well as all the 1191 stmt_vec_info structs of all the stmts in the loop. */ 1192 1193 _loop_vec_info::~_loop_vec_info () 1194 { 1195 int nbbs; 1196 gimple_stmt_iterator si; 1197 int j; 1198 1199 nbbs = loop->num_nodes; 1200 for (j = 0; j < nbbs; j++) 1201 { 1202 basic_block bb = bbs[j]; 1203 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) 1204 free_stmt_vec_info (gsi_stmt (si)); 1205 1206 for (si = gsi_start_bb (bb); !gsi_end_p (si); ) 1207 { 1208 gimple *stmt = gsi_stmt (si); 1209 1210 /* We may have broken canonical form by moving a constant 1211 into RHS1 of a commutative op. Fix such occurrences. 
*/ 1212 if (operands_swapped && is_gimple_assign (stmt)) 1213 { 1214 enum tree_code code = gimple_assign_rhs_code (stmt); 1215 1216 if ((code == PLUS_EXPR 1217 || code == POINTER_PLUS_EXPR 1218 || code == MULT_EXPR) 1219 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt))) 1220 swap_ssa_operands (stmt, 1221 gimple_assign_rhs1_ptr (stmt), 1222 gimple_assign_rhs2_ptr (stmt)); 1223 else if (code == COND_EXPR 1224 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt))) 1225 { 1226 tree cond_expr = gimple_assign_rhs1 (stmt); 1227 enum tree_code cond_code = TREE_CODE (cond_expr); 1228 1229 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) 1230 { 1231 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 1232 0)); 1233 cond_code = invert_tree_comparison (cond_code, 1234 honor_nans); 1235 if (cond_code != ERROR_MARK) 1236 { 1237 TREE_SET_CODE (cond_expr, cond_code); 1238 swap_ssa_operands (stmt, 1239 gimple_assign_rhs2_ptr (stmt), 1240 gimple_assign_rhs3_ptr (stmt)); 1241 } 1242 } 1243 } 1244 } 1245 1246 /* Free stmt_vec_info. */ 1247 free_stmt_vec_info (stmt); 1248 gsi_next (&si); 1249 } 1250 } 1251 1252 free (bbs); 1253 1254 release_vec_loop_masks (&masks); 1255 delete ivexpr_map; 1256 1257 loop->aux = NULL; 1258 } 1259 1260 /* Return an invariant or register for EXPR and emit necessary 1261 computations in the LOOP_VINFO loop preheader. */ 1262 1263 tree 1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr) 1265 { 1266 if (is_gimple_reg (expr) 1267 || is_gimple_min_invariant (expr)) 1268 return expr; 1269 1270 if (! loop_vinfo->ivexpr_map) 1271 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>; 1272 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr); 1273 if (! cached) 1274 { 1275 gimple_seq stmts = NULL; 1276 cached = force_gimple_operand (unshare_expr (expr), 1277 &stmts, true, NULL_TREE); 1278 if (stmts) 1279 { 1280 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); 1281 gsi_insert_seq_on_edge_immediate (e, stmts); 1282 } 1283 } 1284 return cached; 1285 } 1286 1287 /* Return true if we can use CMP_TYPE as the comparison type to produce 1288 all masks required to mask LOOP_VINFO. */ 1289 1290 static bool 1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) 1292 { 1293 rgroup_masks *rgm; 1294 unsigned int i; 1295 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 1296 if (rgm->mask_type != NULL_TREE 1297 && !direct_internal_fn_supported_p (IFN_WHILE_ULT, 1298 cmp_type, rgm->mask_type, 1299 OPTIMIZE_FOR_SPEED)) 1300 return false; 1301 return true; 1302 } 1303 1304 /* Calculate the maximum number of scalars per iteration for every 1305 rgroup in LOOP_VINFO. */ 1306 1307 static unsigned int 1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) 1309 { 1310 unsigned int res = 1; 1311 unsigned int i; 1312 rgroup_masks *rgm; 1313 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 1314 res = MAX (res, rgm->max_nscalars_per_iter); 1315 return res; 1316 } 1317 1318 /* Each statement in LOOP_VINFO can be masked where necessary. Check 1319 whether we can actually generate the masks required. Return true if so, 1320 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */ 1321 1322 static bool 1323 vect_verify_full_masking (loop_vec_info loop_vinfo) 1324 { 1325 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1326 unsigned int min_ni_width; 1327 1328 /* Use a normal loop if there are no statements that need masking. 
1329 This only happens in rare degenerate cases: it means that the loop 1330 has no loads, no stores, and no live-out values. */ 1331 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) 1332 return false; 1333 1334 /* Get the maximum number of iterations that is representable 1335 in the counter type. */ 1336 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); 1337 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; 1338 1339 /* Get a more refined estimate for the number of iterations. */ 1340 widest_int max_back_edges; 1341 if (max_loop_iterations (loop, &max_back_edges)) 1342 max_ni = wi::smin (max_ni, max_back_edges + 1); 1343 1344 /* Account for rgroup masks, in which each bit is replicated N times. */ 1345 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); 1346 1347 /* Work out how many bits we need to represent the limit. */ 1348 min_ni_width = wi::min_precision (max_ni, UNSIGNED); 1349 1350 /* Find a scalar mode for which WHILE_ULT is supported. */ 1351 opt_scalar_int_mode cmp_mode_iter; 1352 tree cmp_type = NULL_TREE; 1353 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) 1354 { 1355 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); 1356 if (cmp_bits >= min_ni_width 1357 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) 1358 { 1359 tree this_type = build_nonstandard_integer_type (cmp_bits, true); 1360 if (this_type 1361 && can_produce_all_loop_masks_p (loop_vinfo, this_type)) 1362 { 1363 /* Although we could stop as soon as we find a valid mode, 1364 it's often better to continue until we hit Pmode, since the 1365 operands to the WHILE are more likely to be reusable in 1366 address calculations. */ 1367 cmp_type = this_type; 1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) 1369 break; 1370 } 1371 } 1372 } 1373 1374 if (!cmp_type) 1375 return false; 1376 1377 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; 1378 return true; 1379 } 1380 1381 /* Calculate the cost of one scalar iteration of the loop. */ 1382 static void 1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) 1384 { 1385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1386 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1387 int nbbs = loop->num_nodes, factor; 1388 int innerloop_iters, i; 1389 1390 /* Gather costs for statements in the scalar loop. */ 1391 1392 /* FORNOW. */ 1393 innerloop_iters = 1; 1394 if (loop->inner) 1395 innerloop_iters = 50; /* FIXME */ 1396 1397 for (i = 0; i < nbbs; i++) 1398 { 1399 gimple_stmt_iterator si; 1400 basic_block bb = bbs[i]; 1401 1402 if (bb->loop_father == loop->inner) 1403 factor = innerloop_iters; 1404 else 1405 factor = 1; 1406 1407 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1408 { 1409 gimple *stmt = gsi_stmt (si); 1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1411 1412 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) 1413 continue; 1414 1415 /* Skip stmts that are not vectorized inside the loop. 
*/ 1416 if (stmt_info 1417 && !STMT_VINFO_RELEVANT_P (stmt_info) 1418 && (!STMT_VINFO_LIVE_P (stmt_info) 1419 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1420 && !STMT_VINFO_IN_PATTERN_P (stmt_info)) 1421 continue; 1422 1423 vect_cost_for_stmt kind; 1424 if (STMT_VINFO_DATA_REF (stmt_info)) 1425 { 1426 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 1427 kind = scalar_load; 1428 else 1429 kind = scalar_store; 1430 } 1431 else 1432 kind = scalar_stmt; 1433 1434 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1435 factor, kind, stmt_info, 0, vect_prologue); 1436 } 1437 } 1438 1439 /* Now accumulate cost. */ 1440 void *target_cost_data = init_cost (loop); 1441 stmt_info_for_cost *si; 1442 int j; 1443 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1444 j, si) 1445 { 1446 struct _stmt_vec_info *stmt_info 1447 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 1448 (void) add_stmt_cost (target_cost_data, si->count, 1449 si->kind, stmt_info, si->misalign, 1450 vect_body); 1451 } 1452 unsigned dummy, body_cost = 0; 1453 finish_cost (target_cost_data, &dummy, &body_cost, &dummy); 1454 destroy_cost_data (target_cost_data); 1455 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost; 1456 } 1457 1458 1459 /* Function vect_analyze_loop_form_1. 1460 1461 Verify that certain CFG restrictions hold, including: 1462 - the loop has a pre-header 1463 - the loop has a single entry and exit 1464 - the loop exit condition is simple enough 1465 - the number of iterations can be analyzed, i.e, a countable loop. The 1466 niter could be analyzed under some assumptions. */ 1467 1468 bool 1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, 1470 tree *assumptions, tree *number_of_iterationsm1, 1471 tree *number_of_iterations, gcond **inner_loop_cond) 1472 { 1473 if (dump_enabled_p ()) 1474 dump_printf_loc (MSG_NOTE, vect_location, 1475 "=== vect_analyze_loop_form ===\n"); 1476 1477 /* Different restrictions apply when we are considering an inner-most loop, 1478 vs. an outer (nested) loop. 1479 (FORNOW. May want to relax some of these restrictions in the future). */ 1480 1481 if (!loop->inner) 1482 { 1483 /* Inner-most loop. We currently require that the number of BBs is 1484 exactly 2 (the header and latch). Vectorizable inner-most loops 1485 look like this: 1486 1487 (pre-header) 1488 | 1489 header <--------+ 1490 | | | 1491 | +--> latch --+ 1492 | 1493 (exit-bb) */ 1494 1495 if (loop->num_nodes != 2) 1496 { 1497 if (dump_enabled_p ()) 1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1499 "not vectorized: control flow in loop.\n"); 1500 return false; 1501 } 1502 1503 if (empty_block_p (loop->header)) 1504 { 1505 if (dump_enabled_p ()) 1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1507 "not vectorized: empty loop.\n"); 1508 return false; 1509 } 1510 } 1511 else 1512 { 1513 struct loop *innerloop = loop->inner; 1514 edge entryedge; 1515 1516 /* Nested loop. We currently require that the loop is doubly-nested, 1517 contains a single inner loop, and the number of BBs is exactly 5. 1518 Vectorizable outer-loops look like this: 1519 1520 (pre-header) 1521 | 1522 header <---+ 1523 | | 1524 inner-loop | 1525 | | 1526 tail ------+ 1527 | 1528 (exit-bb) 1529 1530 The inner-loop has the properties expected of inner-most loops 1531 as described above. 
*/ 1532 1533 if ((loop->inner)->inner || (loop->inner)->next) 1534 { 1535 if (dump_enabled_p ()) 1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1537 "not vectorized: multiple nested loops.\n"); 1538 return false; 1539 } 1540 1541 if (loop->num_nodes != 5) 1542 { 1543 if (dump_enabled_p ()) 1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1545 "not vectorized: control flow in loop.\n"); 1546 return false; 1547 } 1548 1549 entryedge = loop_preheader_edge (innerloop); 1550 if (entryedge->src != loop->header 1551 || !single_exit (innerloop) 1552 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src) 1553 { 1554 if (dump_enabled_p ()) 1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1556 "not vectorized: unsupported outerloop form.\n"); 1557 return false; 1558 } 1559 1560 /* Analyze the inner-loop. */ 1561 tree inner_niterm1, inner_niter, inner_assumptions; 1562 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond, 1563 &inner_assumptions, &inner_niterm1, 1564 &inner_niter, NULL) 1565 /* Don't support analyzing niter under assumptions for inner 1566 loop. */ 1567 || !integer_onep (inner_assumptions)) 1568 { 1569 if (dump_enabled_p ()) 1570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1571 "not vectorized: Bad inner loop.\n"); 1572 return false; 1573 } 1574 1575 if (!expr_invariant_in_loop_p (loop, inner_niter)) 1576 { 1577 if (dump_enabled_p ()) 1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1579 "not vectorized: inner-loop count not" 1580 " invariant.\n"); 1581 return false; 1582 } 1583 1584 if (dump_enabled_p ()) 1585 dump_printf_loc (MSG_NOTE, vect_location, 1586 "Considering outer-loop vectorization.\n"); 1587 } 1588 1589 if (!single_exit (loop) 1590 || EDGE_COUNT (loop->header->preds) != 2) 1591 { 1592 if (dump_enabled_p ()) 1593 { 1594 if (!single_exit (loop)) 1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1596 "not vectorized: multiple exits.\n"); 1597 else if (EDGE_COUNT (loop->header->preds) != 2) 1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1599 "not vectorized: too many incoming edges.\n"); 1600 } 1601 return false; 1602 } 1603 1604 /* We assume that the loop exit condition is at the end of the loop. i.e, 1605 that the loop is represented as a do-while (with a proper if-guard 1606 before the loop if needed), where the loop header contains all the 1607 executable statements, and the latch is empty. */ 1608 if (!empty_block_p (loop->latch) 1609 || !gimple_seq_empty_p (phi_nodes (loop->latch))) 1610 { 1611 if (dump_enabled_p ()) 1612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1613 "not vectorized: latch block not empty.\n"); 1614 return false; 1615 } 1616 1617 /* Make sure the exit is not abnormal. 
*/ 1618 edge e = single_exit (loop); 1619 if (e->flags & EDGE_ABNORMAL) 1620 { 1621 if (dump_enabled_p ()) 1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1623 "not vectorized: abnormal loop exit edge.\n"); 1624 return false; 1625 } 1626 1627 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations, 1628 number_of_iterationsm1); 1629 if (!*loop_cond) 1630 { 1631 if (dump_enabled_p ()) 1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1633 "not vectorized: complicated exit condition.\n"); 1634 return false; 1635 } 1636 1637 if (integer_zerop (*assumptions) 1638 || !*number_of_iterations 1639 || chrec_contains_undetermined (*number_of_iterations)) 1640 { 1641 if (dump_enabled_p ()) 1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1643 "not vectorized: number of iterations cannot be " 1644 "computed.\n"); 1645 return false; 1646 } 1647 1648 if (integer_zerop (*number_of_iterations)) 1649 { 1650 if (dump_enabled_p ()) 1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1652 "not vectorized: number of iterations = 0.\n"); 1653 return false; 1654 } 1655 1656 return true; 1657 } 1658 1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */ 1660 1661 loop_vec_info 1662 vect_analyze_loop_form (struct loop *loop) 1663 { 1664 tree assumptions, number_of_iterations, number_of_iterationsm1; 1665 gcond *loop_cond, *inner_loop_cond = NULL; 1666 1667 if (! vect_analyze_loop_form_1 (loop, &loop_cond, 1668 &assumptions, &number_of_iterationsm1, 1669 &number_of_iterations, &inner_loop_cond)) 1670 return NULL; 1671 1672 loop_vec_info loop_vinfo = new _loop_vec_info (loop); 1673 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1; 1674 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; 1675 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations; 1676 if (!integer_onep (assumptions)) 1677 { 1678 /* We consider to vectorize this loop by versioning it under 1679 some assumptions. In order to do this, we need to clear 1680 existing information computed by scev and niter analyzer. */ 1681 scev_reset_htab (); 1682 free_numbers_of_iterations_estimates (loop); 1683 /* Also set flag for this loop so that following scev and niter 1684 analysis are done under the assumptions. */ 1685 loop_constraint_set (loop, LOOP_C_FINITE); 1686 /* Also record the assumptions for versioning. */ 1687 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions; 1688 } 1689 1690 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1691 { 1692 if (dump_enabled_p ()) 1693 { 1694 dump_printf_loc (MSG_NOTE, vect_location, 1695 "Symbolic number of iterations is "); 1696 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations); 1697 dump_printf (MSG_NOTE, "\n"); 1698 } 1699 } 1700 1701 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type; 1702 if (inner_loop_cond) 1703 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond)) 1704 = loop_exit_ctrl_vec_info_type; 1705 1706 gcc_assert (!loop->aux); 1707 loop->aux = loop_vinfo; 1708 return loop_vinfo; 1709 } 1710 1711 1712 1713 /* Scan the loop stmts and dependent on whether there are any (non-)SLP 1714 statements update the vectorization factor. 
*/ 1715 1716 static void 1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo) 1718 { 1719 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1721 int nbbs = loop->num_nodes; 1722 poly_uint64 vectorization_factor; 1723 int i; 1724 1725 if (dump_enabled_p ()) 1726 dump_printf_loc (MSG_NOTE, vect_location, 1727 "=== vect_update_vf_for_slp ===\n"); 1728 1729 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1730 gcc_assert (known_ne (vectorization_factor, 0U)); 1731 1732 /* If all the stmts in the loop can be SLPed, we perform only SLP, and 1733 vectorization factor of the loop is the unrolling factor required by 1734 the SLP instances. If that unrolling factor is 1, we say, that we 1735 perform pure SLP on loop - cross iteration parallelism is not 1736 exploited. */ 1737 bool only_slp_in_loop = true; 1738 for (i = 0; i < nbbs; i++) 1739 { 1740 basic_block bb = bbs[i]; 1741 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1742 gsi_next (&si)) 1743 { 1744 gimple *stmt = gsi_stmt (si); 1745 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1746 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 1747 && STMT_VINFO_RELATED_STMT (stmt_info)) 1748 { 1749 stmt = STMT_VINFO_RELATED_STMT (stmt_info); 1750 stmt_info = vinfo_for_stmt (stmt); 1751 } 1752 if ((STMT_VINFO_RELEVANT_P (stmt_info) 1753 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1754 && !PURE_SLP_STMT (stmt_info)) 1755 /* STMT needs both SLP and loop-based vectorization. */ 1756 only_slp_in_loop = false; 1757 } 1758 } 1759 1760 if (only_slp_in_loop) 1761 { 1762 dump_printf_loc (MSG_NOTE, vect_location, 1763 "Loop contains only SLP stmts\n"); 1764 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); 1765 } 1766 else 1767 { 1768 dump_printf_loc (MSG_NOTE, vect_location, 1769 "Loop contains SLP and non-SLP stmts\n"); 1770 /* Both the vectorization factor and unroll factor have the form 1771 current_vector_size * X for some rational X, so they must have 1772 a common multiple. */ 1773 vectorization_factor 1774 = force_common_multiple (vectorization_factor, 1775 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); 1776 } 1777 1778 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 1779 if (dump_enabled_p ()) 1780 { 1781 dump_printf_loc (MSG_NOTE, vect_location, 1782 "Updating vectorization factor to "); 1783 dump_dec (MSG_NOTE, vectorization_factor); 1784 dump_printf (MSG_NOTE, ".\n"); 1785 } 1786 } 1787 1788 /* Return true if STMT_INFO describes a double reduction phi and if 1789 the other phi in the reduction is also relevant for vectorization. 1790 This rejects cases such as: 1791 1792 outer1: 1793 x_1 = PHI <x_3(outer2), ...>; 1794 ... 1795 1796 inner: 1797 x_2 = ...; 1798 ... 1799 1800 outer2: 1801 x_3 = PHI <x_2(inner)>; 1802 1803 if nothing in x_2 or elsewhere makes x_1 relevant. */ 1804 1805 static bool 1806 vect_active_double_reduction_p (stmt_vec_info stmt_info) 1807 { 1808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) 1809 return false; 1810 1811 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info); 1812 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi)); 1813 } 1814 1815 /* Function vect_analyze_loop_operations. 1816 1817 Scan the loop stmts and make sure they are all vectorizable. 
*/ 1818 1819 static bool 1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo) 1821 { 1822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1823 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1824 int nbbs = loop->num_nodes; 1825 int i; 1826 stmt_vec_info stmt_info; 1827 bool need_to_vectorize = false; 1828 bool ok; 1829 1830 if (dump_enabled_p ()) 1831 dump_printf_loc (MSG_NOTE, vect_location, 1832 "=== vect_analyze_loop_operations ===\n"); 1833 1834 for (i = 0; i < nbbs; i++) 1835 { 1836 basic_block bb = bbs[i]; 1837 1838 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 1839 gsi_next (&si)) 1840 { 1841 gphi *phi = si.phi (); 1842 ok = true; 1843 1844 stmt_info = vinfo_for_stmt (phi); 1845 if (dump_enabled_p ()) 1846 { 1847 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: "); 1848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 1849 } 1850 if (virtual_operand_p (gimple_phi_result (phi))) 1851 continue; 1852 1853 /* Inner-loop loop-closed exit phi in outer-loop vectorization 1854 (i.e., a phi in the tail of the outer-loop). */ 1855 if (! is_loop_header_bb_p (bb)) 1856 { 1857 /* FORNOW: we currently don't support the case that these phis 1858 are not used in the outerloop (unless it is double reduction, 1859 i.e., this phi is vect_reduction_def), cause this case 1860 requires to actually do something here. */ 1861 if (STMT_VINFO_LIVE_P (stmt_info) 1862 && !vect_active_double_reduction_p (stmt_info)) 1863 { 1864 if (dump_enabled_p ()) 1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1866 "Unsupported loop-closed phi in " 1867 "outer-loop.\n"); 1868 return false; 1869 } 1870 1871 /* If PHI is used in the outer loop, we check that its operand 1872 is defined in the inner loop. */ 1873 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1874 { 1875 tree phi_op; 1876 gimple *op_def_stmt; 1877 1878 if (gimple_phi_num_args (phi) != 1) 1879 return false; 1880 1881 phi_op = PHI_ARG_DEF (phi, 0); 1882 if (TREE_CODE (phi_op) != SSA_NAME) 1883 return false; 1884 1885 op_def_stmt = SSA_NAME_DEF_STMT (phi_op); 1886 if (gimple_nop_p (op_def_stmt) 1887 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt)) 1888 || !vinfo_for_stmt (op_def_stmt)) 1889 return false; 1890 1891 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt)) 1892 != vect_used_in_outer 1893 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt)) 1894 != vect_used_in_outer_by_reduction) 1895 return false; 1896 } 1897 1898 continue; 1899 } 1900 1901 gcc_assert (stmt_info); 1902 1903 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope 1904 || STMT_VINFO_LIVE_P (stmt_info)) 1905 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 1906 { 1907 /* A scalar-dependence cycle that we don't support. */ 1908 if (dump_enabled_p ()) 1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1910 "not vectorized: scalar dependence cycle.\n"); 1911 return false; 1912 } 1913 1914 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1915 { 1916 need_to_vectorize = true; 1917 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 1918 && ! PURE_SLP_STMT (stmt_info)) 1919 ok = vectorizable_induction (phi, NULL, NULL, NULL); 1920 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1921 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 1922 && ! PURE_SLP_STMT (stmt_info)) 1923 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL); 1924 } 1925 1926 /* SLP PHIs are tested by vect_slp_analyze_node_operations. 
*/ 1927 if (ok 1928 && STMT_VINFO_LIVE_P (stmt_info) 1929 && !PURE_SLP_STMT (stmt_info)) 1930 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL); 1931 1932 if (!ok) 1933 { 1934 if (dump_enabled_p ()) 1935 { 1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1937 "not vectorized: relevant phi not " 1938 "supported: "); 1939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0); 1940 } 1941 return false; 1942 } 1943 } 1944 1945 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1946 gsi_next (&si)) 1947 { 1948 gimple *stmt = gsi_stmt (si); 1949 if (!gimple_clobber_p (stmt) 1950 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL)) 1951 return false; 1952 } 1953 } /* bbs */ 1954 1955 /* All operations in the loop are either irrelevant (deal with loop 1956 control, or dead), or only used outside the loop and can be moved 1957 out of the loop (e.g. invariants, inductions). The loop can be 1958 optimized away by scalar optimizations. We're better off not 1959 touching this loop. */ 1960 if (!need_to_vectorize) 1961 { 1962 if (dump_enabled_p ()) 1963 dump_printf_loc (MSG_NOTE, vect_location, 1964 "All the computation can be taken out of the loop.\n"); 1965 if (dump_enabled_p ()) 1966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1967 "not vectorized: redundant loop. no profit to " 1968 "vectorize.\n"); 1969 return false; 1970 } 1971 1972 return true; 1973 } 1974 1975 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it 1976 is worthwhile to vectorize. Return 1 if definitely yes, 0 if 1977 definitely no, or -1 if it's worth retrying. */ 1978 1979 static int 1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo) 1981 { 1982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1983 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 1984 1985 /* Only fully-masked loops can have iteration counts less than the 1986 vectorization factor. */ 1987 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 1988 { 1989 HOST_WIDE_INT max_niter; 1990 1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1992 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo); 1993 else 1994 max_niter = max_stmt_executions_int (loop); 1995 1996 if (max_niter != -1 1997 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf) 1998 { 1999 if (dump_enabled_p ()) 2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2001 "not vectorized: iteration count smaller than " 2002 "vectorization factor.\n"); 2003 return 0; 2004 } 2005 } 2006 2007 int min_profitable_iters, min_profitable_estimate; 2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, 2009 &min_profitable_estimate); 2010 2011 if (min_profitable_iters < 0) 2012 { 2013 if (dump_enabled_p ()) 2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2015 "not vectorized: vectorization not profitable.\n"); 2016 if (dump_enabled_p ()) 2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2018 "not vectorized: vector version will never be " 2019 "profitable.\n"); 2020 return -1; 2021 } 2022 2023 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) 2024 * assumed_vf); 2025 2026 /* Use the cost model only if it is more conservative than user specified 2027 threshold. 
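   For example (illustrative numbers only): with --param min-vect-loop-bound=2
   and an assumed vectorization factor of 4, the user-specified bound
   corresponds to 2 * 4 = 8 scalar iterations; if the cost model computed
   min_profitable_iters = 11, the threshold below becomes MAX (8, 11) = 11.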
*/ 2028 unsigned int th = (unsigned) MAX (min_scalar_loop_bound, 2029 min_profitable_iters); 2030 2031 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; 2032 2033 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2034 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) 2035 { 2036 if (dump_enabled_p ()) 2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2038 "not vectorized: vectorization not profitable.\n"); 2039 if (dump_enabled_p ()) 2040 dump_printf_loc (MSG_NOTE, vect_location, 2041 "not vectorized: iteration count smaller than user " 2042 "specified loop bound parameter or minimum profitable " 2043 "iterations (whichever is more conservative).\n"); 2044 return 0; 2045 } 2046 2047 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); 2048 if (estimated_niter == -1) 2049 estimated_niter = likely_max_stmt_executions_int (loop); 2050 if (estimated_niter != -1 2051 && ((unsigned HOST_WIDE_INT) estimated_niter 2052 < MAX (th, (unsigned) min_profitable_estimate))) 2053 { 2054 if (dump_enabled_p ()) 2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2056 "not vectorized: estimated iteration count too " 2057 "small.\n"); 2058 if (dump_enabled_p ()) 2059 dump_printf_loc (MSG_NOTE, vect_location, 2060 "not vectorized: estimated iteration count smaller " 2061 "than specified loop bound parameter or minimum " 2062 "profitable iterations (whichever is more " 2063 "conservative).\n"); 2064 return -1; 2065 } 2066 2067 return 1; 2068 } 2069 2070 2071 /* Function vect_analyze_loop_2. 2072 2073 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2074 for it. The different analyses will record information in the 2075 loop_vec_info struct. */ 2076 static bool 2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) 2078 { 2079 bool ok; 2080 int res; 2081 unsigned int max_vf = MAX_VECTORIZATION_FACTOR; 2082 poly_uint64 min_vf = 2; 2083 unsigned int n_stmts = 0; 2084 2085 /* The first group of checks is independent of the vector size. */ 2086 fatal = true; 2087 2088 /* Find all data references in the loop (which correspond to vdefs/vuses) 2089 and analyze their evolution in the loop. 
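   For instance (a minimal sketch, with made-up names), in

     void
     f (short *restrict a, short *restrict b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   the store to a[i] (a vdef) and the load from b[i] (a vuse) are the data
   references collected here, and their evolution in the loop is the simple
   affine form base + i * sizeof (short).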
*/ 2090 2091 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 2092 2093 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); 2094 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))) 2095 { 2096 if (dump_enabled_p ()) 2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2098 "not vectorized: loop nest containing two " 2099 "or more consecutive inner loops cannot be " 2100 "vectorized\n"); 2101 return false; 2102 } 2103 2104 for (unsigned i = 0; i < loop->num_nodes; i++) 2105 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); 2106 !gsi_end_p (gsi); gsi_next (&gsi)) 2107 { 2108 gimple *stmt = gsi_stmt (gsi); 2109 if (is_gimple_debug (stmt)) 2110 continue; 2111 ++n_stmts; 2112 if (!find_data_references_in_stmt (loop, stmt, 2113 &LOOP_VINFO_DATAREFS (loop_vinfo))) 2114 { 2115 if (is_gimple_call (stmt) && loop->safelen) 2116 { 2117 tree fndecl = gimple_call_fndecl (stmt), op; 2118 if (fndecl != NULL_TREE) 2119 { 2120 cgraph_node *node = cgraph_node::get (fndecl); 2121 if (node != NULL && node->simd_clones != NULL) 2122 { 2123 unsigned int j, n = gimple_call_num_args (stmt); 2124 for (j = 0; j < n; j++) 2125 { 2126 op = gimple_call_arg (stmt, j); 2127 if (DECL_P (op) 2128 || (REFERENCE_CLASS_P (op) 2129 && get_base_address (op))) 2130 break; 2131 } 2132 op = gimple_call_lhs (stmt); 2133 /* Ignore #pragma omp declare simd functions 2134 if they don't have data references in the 2135 call stmt itself. */ 2136 if (j == n 2137 && !(op 2138 && (DECL_P (op) 2139 || (REFERENCE_CLASS_P (op) 2140 && get_base_address (op))))) 2141 continue; 2142 } 2143 } 2144 } 2145 if (dump_enabled_p ()) 2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2147 "not vectorized: loop contains function " 2148 "calls or data references that cannot " 2149 "be analyzed\n"); 2150 return false; 2151 } 2152 } 2153 2154 /* Analyze the data references and also adjust the minimal 2155 vectorization factor according to the loads and stores. */ 2156 2157 ok = vect_analyze_data_refs (loop_vinfo, &min_vf); 2158 if (!ok) 2159 { 2160 if (dump_enabled_p ()) 2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2162 "bad data references.\n"); 2163 return false; 2164 } 2165 2166 /* Classify all cross-iteration scalar data-flow cycles. 2167 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 2168 vect_analyze_scalar_cycles (loop_vinfo); 2169 2170 vect_pattern_recog (loop_vinfo); 2171 2172 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); 2173 2174 /* Analyze the access patterns of the data-refs in the loop (consecutive, 2175 complex, etc.). FORNOW: Only handle consecutive access pattern. */ 2176 2177 ok = vect_analyze_data_ref_accesses (loop_vinfo); 2178 if (!ok) 2179 { 2180 if (dump_enabled_p ()) 2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2182 "bad data access.\n"); 2183 return false; 2184 } 2185 2186 /* Data-flow analysis to detect stmts that do not need to be vectorized. */ 2187 2188 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); 2189 if (!ok) 2190 { 2191 if (dump_enabled_p ()) 2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2193 "unexpected pattern.\n"); 2194 return false; 2195 } 2196 2197 /* While the rest of the analysis below depends on it in some way. */ 2198 fatal = false; 2199 2200 /* Analyze data dependences between the data-refs in the loop 2201 and adjust the maximum vectorization factor according to 2202 the dependences. 2203 FORNOW: fail at the first data dependence that we encounter. 
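   As a rough illustration (hypothetical example; the exact outcome depends on
   the dependence analysis), a loop-carried dependence of distance 3 such as

     for (int i = 0; i < n - 3; i++)
       a[i + 3] = a[i] + 1;

   does not necessarily forbid vectorization, but it caps the maximum usable
   vectorization factor at 3, because a vector iteration must not read an
   element that the same vector iteration writes.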
*/ 2204 2205 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); 2206 if (!ok 2207 || (max_vf != MAX_VECTORIZATION_FACTOR 2208 && maybe_lt (max_vf, min_vf))) 2209 { 2210 if (dump_enabled_p ()) 2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2212 "bad data dependence.\n"); 2213 return false; 2214 } 2215 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; 2216 2217 ok = vect_determine_vectorization_factor (loop_vinfo); 2218 if (!ok) 2219 { 2220 if (dump_enabled_p ()) 2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2222 "can't determine vectorization factor.\n"); 2223 return false; 2224 } 2225 if (max_vf != MAX_VECTORIZATION_FACTOR 2226 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 2227 { 2228 if (dump_enabled_p ()) 2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2230 "bad data dependence.\n"); 2231 return false; 2232 } 2233 2234 /* Compute the scalar iteration cost. */ 2235 vect_compute_single_scalar_iteration_cost (loop_vinfo); 2236 2237 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2238 unsigned th; 2239 2240 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ 2241 ok = vect_analyze_slp (loop_vinfo, n_stmts); 2242 if (!ok) 2243 return false; 2244 2245 /* If there are any SLP instances mark them as pure_slp. */ 2246 bool slp = vect_make_slp_decision (loop_vinfo); 2247 if (slp) 2248 { 2249 /* Find stmts that need to be both vectorized and SLPed. */ 2250 vect_detect_hybrid_slp (loop_vinfo); 2251 2252 /* Update the vectorization factor based on the SLP decision. */ 2253 vect_update_vf_for_slp (loop_vinfo); 2254 } 2255 2256 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo); 2257 2258 /* We don't expect to have to roll back to anything other than an empty 2259 set of rgroups. */ 2260 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); 2261 2262 /* This is the point where we can re-start analysis with SLP forced off. */ 2263 start_over: 2264 2265 /* Now the vectorization factor is final. */ 2266 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2267 gcc_assert (known_ne (vectorization_factor, 0U)); 2268 2269 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) 2270 { 2271 dump_printf_loc (MSG_NOTE, vect_location, 2272 "vectorization_factor = "); 2273 dump_dec (MSG_NOTE, vectorization_factor); 2274 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n", 2275 LOOP_VINFO_INT_NITERS (loop_vinfo)); 2276 } 2277 2278 HOST_WIDE_INT max_niter 2279 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); 2280 2281 /* Analyze the alignment of the data-refs in the loop. 2282 Fail if a data reference is found that cannot be vectorized. */ 2283 2284 ok = vect_analyze_data_refs_alignment (loop_vinfo); 2285 if (!ok) 2286 { 2287 if (dump_enabled_p ()) 2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2289 "bad data alignment.\n"); 2290 return false; 2291 } 2292 2293 /* Prune the list of ddrs to be tested at run-time by versioning for alias. 2294 It is important to call pruning after vect_analyze_data_ref_accesses, 2295 since we use grouping information gathered by interleaving analysis. */ 2296 ok = vect_prune_runtime_alias_test_list (loop_vinfo); 2297 if (!ok) 2298 return false; 2299 2300 /* Do not invoke vect_enhance_data_refs_alignment for eplilogue 2301 vectorization. 
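   As an illustrative sketch of what the enhancement below can do when it is
   invoked (all figures are hypothetical): with 16-byte vectors, 4-byte
   elements and a data reference that starts 8 bytes past a 16-byte boundary,
   the pass may peel (16 - 8) / 4 = 2 scalar iterations so that the remaining
   accesses are aligned, or it may version the loop under a runtime alignment
   check.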
*/ 2302 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 2303 { 2304 /* This pass will decide on using loop versioning and/or loop peeling in 2305 order to enhance the alignment of data references in the loop. */ 2306 ok = vect_enhance_data_refs_alignment (loop_vinfo); 2307 if (!ok) 2308 { 2309 if (dump_enabled_p ()) 2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2311 "bad data alignment.\n"); 2312 return false; 2313 } 2314 } 2315 2316 if (slp) 2317 { 2318 /* Analyze operations in the SLP instances. Note this may 2319 remove unsupported SLP instances which makes the above 2320 SLP kind detection invalid. */ 2321 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); 2322 vect_slp_analyze_operations (loop_vinfo); 2323 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) 2324 goto again; 2325 } 2326 2327 /* Scan all the remaining operations in the loop that are not subject 2328 to SLP and make sure they are vectorizable. */ 2329 ok = vect_analyze_loop_operations (loop_vinfo); 2330 if (!ok) 2331 { 2332 if (dump_enabled_p ()) 2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2334 "bad operation or unsupported loop bound.\n"); 2335 return false; 2336 } 2337 2338 /* Decide whether to use a fully-masked loop for this vectorization 2339 factor. */ 2340 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 2341 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) 2342 && vect_verify_full_masking (loop_vinfo)); 2343 if (dump_enabled_p ()) 2344 { 2345 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2346 dump_printf_loc (MSG_NOTE, vect_location, 2347 "using a fully-masked loop.\n"); 2348 else 2349 dump_printf_loc (MSG_NOTE, vect_location, 2350 "not using a fully-masked loop.\n"); 2351 } 2352 2353 /* If epilog loop is required because of data accesses with gaps, 2354 one additional iteration needs to be peeled. Check if there is 2355 enough iterations for vectorization. */ 2356 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2357 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2358 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2359 { 2360 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2361 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo); 2362 2363 if (known_lt (wi::to_widest (scalar_niters), vf)) 2364 { 2365 if (dump_enabled_p ()) 2366 dump_printf_loc (MSG_NOTE, vect_location, 2367 "loop has no enough iterations to support" 2368 " peeling for gaps.\n"); 2369 return false; 2370 } 2371 } 2372 2373 /* Check the costings of the loop make vectorizing worthwhile. */ 2374 res = vect_analyze_loop_costing (loop_vinfo); 2375 if (res < 0) 2376 goto again; 2377 if (!res) 2378 { 2379 if (dump_enabled_p ()) 2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2381 "Loop costings not worthwhile.\n"); 2382 return false; 2383 } 2384 2385 /* Decide whether we need to create an epilogue loop to handle 2386 remaining scalar iterations. */ 2387 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 2388 2389 unsigned HOST_WIDE_INT const_vf; 2390 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2391 /* The main loop handles all iterations. */ 2392 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2393 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2394 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) 2395 { 2396 /* Work out the (constant) number of iterations that need to be 2397 peeled for reasons other than niters. 
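   A small worked example (illustrative numbers only): with NITERS = 103,
   VF = 8, 3 iterations peeled for alignment and 1 extra iteration peeled for
   gaps, peel_niter is 4; since 103 - 4 = 99 is not a multiple of 8, an
   epilogue loop is still needed for the left-over iterations.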
*/ 2398 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2399 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2400 peel_niter += 1; 2401 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, 2402 LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 2403 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2404 } 2405 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2406 /* ??? When peeling for gaps but not alignment, we could 2407 try to check whether the (variable) niters is known to be 2408 VF * N + 1. That's something of a niche case though. */ 2409 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2410 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) 2411 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) 2412 < (unsigned) exact_log2 (const_vf)) 2413 /* In case of versioning, check if the maximum number of 2414 iterations is greater than th. If they are identical, 2415 the epilogue is unnecessary. */ 2416 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 2417 || ((unsigned HOST_WIDE_INT) max_niter 2418 > (th / const_vf) * const_vf)))) 2419 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2420 2421 /* If an epilogue loop is required make sure we can create one. */ 2422 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2423 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2424 { 2425 if (dump_enabled_p ()) 2426 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); 2427 if (!vect_can_advance_ivs_p (loop_vinfo) 2428 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo), 2429 single_exit (LOOP_VINFO_LOOP 2430 (loop_vinfo)))) 2431 { 2432 if (dump_enabled_p ()) 2433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2434 "not vectorized: can't create required " 2435 "epilog loop\n"); 2436 goto again; 2437 } 2438 } 2439 2440 /* During peeling, we need to check if number of loop iterations is 2441 enough for both peeled prolog loop and vector loop. This check 2442 can be merged along with threshold check of loop versioning, so 2443 increase threshold for this case if necessary. */ 2444 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 2445 { 2446 poly_uint64 niters_th = 0; 2447 2448 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) 2449 { 2450 /* Niters for peeled prolog loop. */ 2451 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2452 { 2453 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2454 tree vectype 2455 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); 2456 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; 2457 } 2458 else 2459 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2460 } 2461 2462 /* Niters for at least one iteration of vectorized loop. */ 2463 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2464 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2465 /* One additional iteration because of peeling for gap. */ 2466 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2467 niters_th += 1; 2468 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; 2469 } 2470 2471 gcc_assert (known_eq (vectorization_factor, 2472 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); 2473 2474 /* Ok to vectorize! */ 2475 return true; 2476 2477 again: 2478 /* Try again with SLP forced off but if we didn't do any SLP there is 2479 no point in re-trying. */ 2480 if (!slp) 2481 return false; 2482 2483 /* If there are reduction chains re-trying will fail anyway. */ 2484 if (! 
LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) 2485 return false; 2486 2487 /* Likewise if the grouped loads or stores in the SLP cannot be handled 2488 via interleaving or lane instructions. */ 2489 slp_instance instance; 2490 slp_tree node; 2491 unsigned i, j; 2492 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 2493 { 2494 stmt_vec_info vinfo; 2495 vinfo = vinfo_for_stmt 2496 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]); 2497 if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) 2498 continue; 2499 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); 2500 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo); 2501 tree vectype = STMT_VINFO_VECTYPE (vinfo); 2502 if (! vect_store_lanes_supported (vectype, size, false) 2503 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) 2504 && ! vect_grouped_store_supported (vectype, size)) 2505 return false; 2506 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) 2507 { 2508 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]); 2509 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); 2510 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo); 2511 size = STMT_VINFO_GROUP_SIZE (vinfo); 2512 vectype = STMT_VINFO_VECTYPE (vinfo); 2513 if (! vect_load_lanes_supported (vectype, size, false) 2514 && ! vect_grouped_load_supported (vectype, single_element_p, 2515 size)) 2516 return false; 2517 } 2518 } 2519 2520 if (dump_enabled_p ()) 2521 dump_printf_loc (MSG_NOTE, vect_location, 2522 "re-trying with SLP disabled\n"); 2523 2524 /* Roll back state appropriately. No SLP this time. */ 2525 slp = false; 2526 /* Restore vectorization factor as it were without SLP. */ 2527 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; 2528 /* Free the SLP instances. */ 2529 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) 2530 vect_free_slp_instance (instance); 2531 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 2532 /* Reset SLP type to loop_vect on all stmts. */ 2533 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2534 { 2535 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2536 for (gimple_stmt_iterator si = gsi_start_phis (bb); 2537 !gsi_end_p (si); gsi_next (&si)) 2538 { 2539 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si)); 2540 STMT_SLP_TYPE (stmt_info) = loop_vect; 2541 } 2542 for (gimple_stmt_iterator si = gsi_start_bb (bb); 2543 !gsi_end_p (si); gsi_next (&si)) 2544 { 2545 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si)); 2546 STMT_SLP_TYPE (stmt_info) = loop_vect; 2547 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2548 { 2549 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); 2550 STMT_SLP_TYPE (stmt_info) = loop_vect; 2551 for (gimple_stmt_iterator pi 2552 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); 2553 !gsi_end_p (pi); gsi_next (&pi)) 2554 { 2555 gimple *pstmt = gsi_stmt (pi); 2556 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect; 2557 } 2558 } 2559 } 2560 } 2561 /* Free optimized alias test DDRS. */ 2562 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); 2563 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); 2564 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); 2565 /* Reset target cost data. */ 2566 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); 2567 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) 2568 = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); 2569 /* Reset accumulated rgroup information. */ 2570 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo)); 2571 /* Reset assorted flags. 
*/ 2572 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2573 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; 2574 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; 2575 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; 2576 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; 2577 2578 goto start_over; 2579 } 2580 2581 /* Function vect_analyze_loop. 2582 2583 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2584 for it. The different analyses will record information in the 2585 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must 2586 be vectorized. */ 2587 loop_vec_info 2588 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo) 2589 { 2590 loop_vec_info loop_vinfo; 2591 auto_vector_sizes vector_sizes; 2592 2593 /* Autodetect first vector size we try. */ 2594 current_vector_size = 0; 2595 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 2596 unsigned int next_size = 0; 2597 2598 if (dump_enabled_p ()) 2599 dump_printf_loc (MSG_NOTE, vect_location, 2600 "===== analyze_loop_nest =====\n"); 2601 2602 if (loop_outer (loop) 2603 && loop_vec_info_for_loop (loop_outer (loop)) 2604 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) 2605 { 2606 if (dump_enabled_p ()) 2607 dump_printf_loc (MSG_NOTE, vect_location, 2608 "outer-loop already vectorized.\n"); 2609 return NULL; 2610 } 2611 2612 poly_uint64 autodetected_vector_size = 0; 2613 while (1) 2614 { 2615 /* Check the CFG characteristics of the loop (nesting, entry/exit). */ 2616 loop_vinfo = vect_analyze_loop_form (loop); 2617 if (!loop_vinfo) 2618 { 2619 if (dump_enabled_p ()) 2620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2621 "bad loop form.\n"); 2622 return NULL; 2623 } 2624 2625 bool fatal = false; 2626 2627 if (orig_loop_vinfo) 2628 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; 2629 2630 if (vect_analyze_loop_2 (loop_vinfo, fatal)) 2631 { 2632 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 2633 2634 return loop_vinfo; 2635 } 2636 2637 delete loop_vinfo; 2638 2639 if (next_size == 0) 2640 autodetected_vector_size = current_vector_size; 2641 2642 if (next_size < vector_sizes.length () 2643 && known_eq (vector_sizes[next_size], autodetected_vector_size)) 2644 next_size += 1; 2645 2646 if (fatal 2647 || next_size == vector_sizes.length () 2648 || known_eq (current_vector_size, 0U)) 2649 return NULL; 2650 2651 /* Try the next biggest vector size. */ 2652 current_vector_size = vector_sizes[next_size++]; 2653 if (dump_enabled_p ()) 2654 { 2655 dump_printf_loc (MSG_NOTE, vect_location, 2656 "***** Re-trying analysis with " 2657 "vector size "); 2658 dump_dec (MSG_NOTE, current_vector_size); 2659 dump_printf (MSG_NOTE, "\n"); 2660 } 2661 } 2662 } 2663 2664 /* Return true if there is an in-order reduction function for CODE, storing 2665 it in *REDUC_FN if so. */ 2666 2667 static bool 2668 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn) 2669 { 2670 switch (code) 2671 { 2672 case PLUS_EXPR: 2673 *reduc_fn = IFN_FOLD_LEFT_PLUS; 2674 return true; 2675 2676 default: 2677 return false; 2678 } 2679 } 2680 2681 /* Function reduction_fn_for_scalar_code 2682 2683 Input: 2684 CODE - tree_code of a reduction operations. 2685 2686 Output: 2687 REDUC_FN - the corresponding internal function to be used to reduce the 2688 vector of partial results into a single scalar result, or IFN_LAST 2689 if the operation is a supported reduction operation, but does not have 2690 such an internal function. 
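   A minimal usage sketch (the variable names here are illustrative only):

     internal_fn reduc_fn;
     if (reduction_fn_for_scalar_code (MAX_EXPR, &reduc_fn)
         && reduc_fn != IFN_LAST)
       ... use reduc_fn (here IFN_REDUC_MAX) to reduce the vector of
           partial results into a single scalar ...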
2691 2692 Return FALSE if CODE currently cannot be vectorized as reduction. */ 2693 2694 static bool 2695 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) 2696 { 2697 switch (code) 2698 { 2699 case MAX_EXPR: 2700 *reduc_fn = IFN_REDUC_MAX; 2701 return true; 2702 2703 case MIN_EXPR: 2704 *reduc_fn = IFN_REDUC_MIN; 2705 return true; 2706 2707 case PLUS_EXPR: 2708 *reduc_fn = IFN_REDUC_PLUS; 2709 return true; 2710 2711 case BIT_AND_EXPR: 2712 *reduc_fn = IFN_REDUC_AND; 2713 return true; 2714 2715 case BIT_IOR_EXPR: 2716 *reduc_fn = IFN_REDUC_IOR; 2717 return true; 2718 2719 case BIT_XOR_EXPR: 2720 *reduc_fn = IFN_REDUC_XOR; 2721 return true; 2722 2723 case MULT_EXPR: 2724 case MINUS_EXPR: 2725 *reduc_fn = IFN_LAST; 2726 return true; 2727 2728 default: 2729 return false; 2730 } 2731 } 2732 2733 /* If there is a neutral value X such that SLP reduction NODE would not 2734 be affected by the introduction of additional X elements, return that X, 2735 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN 2736 is true if the SLP statements perform a single reduction, false if each 2737 statement performs an independent reduction. */ 2738 2739 static tree 2740 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, 2741 bool reduc_chain) 2742 { 2743 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 2744 gimple *stmt = stmts[0]; 2745 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 2746 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); 2747 tree scalar_type = TREE_TYPE (vector_type); 2748 struct loop *loop = gimple_bb (stmt)->loop_father; 2749 gcc_assert (loop); 2750 2751 switch (code) 2752 { 2753 case WIDEN_SUM_EXPR: 2754 case DOT_PROD_EXPR: 2755 case SAD_EXPR: 2756 case PLUS_EXPR: 2757 case MINUS_EXPR: 2758 case BIT_IOR_EXPR: 2759 case BIT_XOR_EXPR: 2760 return build_zero_cst (scalar_type); 2761 2762 case MULT_EXPR: 2763 return build_one_cst (scalar_type); 2764 2765 case BIT_AND_EXPR: 2766 return build_all_ones_cst (scalar_type); 2767 2768 case MAX_EXPR: 2769 case MIN_EXPR: 2770 /* For MIN/MAX the initial values are neutral. A reduction chain 2771 has only a single initial value, so that value is neutral for 2772 all statements. */ 2773 if (reduc_chain) 2774 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop)); 2775 return NULL_TREE; 2776 2777 default: 2778 return NULL_TREE; 2779 } 2780 } 2781 2782 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement 2783 STMT is printed with a message MSG. */ 2784 2785 static void 2786 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) 2787 { 2788 dump_printf_loc (msg_type, vect_location, "%s", msg); 2789 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0); 2790 } 2791 2792 2793 /* Detect SLP reduction of the form: 2794 2795 #a1 = phi <a5, a0> 2796 a2 = operation (a1) 2797 a3 = operation (a2) 2798 a4 = operation (a3) 2799 a5 = operation (a4) 2800 2801 #a = phi <a5> 2802 2803 PHI is the reduction phi node (#a1 = phi <a5, a0> above) 2804 FIRST_STMT is the first reduction stmt in the chain 2805 (a2 = operation (a1)). 2806 2807 Return TRUE if a reduction chain was detected. 
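   One way such a chain can arise at the source level (an illustrative
   example only) is a manually unrolled accumulation:

     for (i = 0; i < n; i += 4)
       {
         s += x[i];
         s += x[i + 1];
         s += x[i + 2];
         s += x[i + 3];
       }

   where each addition feeds the next, giving the a1 -> a2 -> a3 -> a4 -> a5
   pattern shown above.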
*/ 2808 2809 static bool 2810 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, 2811 gimple *first_stmt) 2812 { 2813 struct loop *loop = (gimple_bb (phi))->loop_father; 2814 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); 2815 enum tree_code code; 2816 gimple *loop_use_stmt = NULL; 2817 stmt_vec_info use_stmt_info; 2818 tree lhs; 2819 imm_use_iterator imm_iter; 2820 use_operand_p use_p; 2821 int nloop_uses, size = 0, n_out_of_loop_uses; 2822 bool found = false; 2823 2824 if (loop != vect_loop) 2825 return false; 2826 2827 auto_vec<stmt_vec_info, 8> reduc_chain; 2828 lhs = PHI_RESULT (phi); 2829 code = gimple_assign_rhs_code (first_stmt); 2830 while (1) 2831 { 2832 nloop_uses = 0; 2833 n_out_of_loop_uses = 0; 2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 2835 { 2836 gimple *use_stmt = USE_STMT (use_p); 2837 if (is_gimple_debug (use_stmt)) 2838 continue; 2839 2840 /* Check if we got back to the reduction phi. */ 2841 if (use_stmt == phi) 2842 { 2843 loop_use_stmt = use_stmt; 2844 found = true; 2845 break; 2846 } 2847 2848 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2849 { 2850 loop_use_stmt = use_stmt; 2851 nloop_uses++; 2852 } 2853 else 2854 n_out_of_loop_uses++; 2855 2856 /* There are can be either a single use in the loop or two uses in 2857 phi nodes. */ 2858 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) 2859 return false; 2860 } 2861 2862 if (found) 2863 break; 2864 2865 /* We reached a statement with no loop uses. */ 2866 if (nloop_uses == 0) 2867 return false; 2868 2869 /* This is a loop exit phi, and we haven't reached the reduction phi. */ 2870 if (gimple_code (loop_use_stmt) == GIMPLE_PHI) 2871 return false; 2872 2873 if (!is_gimple_assign (loop_use_stmt) 2874 || code != gimple_assign_rhs_code (loop_use_stmt) 2875 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) 2876 return false; 2877 2878 /* Insert USE_STMT into reduction chain. */ 2879 use_stmt_info = vinfo_for_stmt (loop_use_stmt); 2880 reduc_chain.safe_push (use_stmt_info); 2881 2882 lhs = gimple_assign_lhs (loop_use_stmt); 2883 size++; 2884 } 2885 2886 if (!found || loop_use_stmt != phi || size < 2) 2887 return false; 2888 2889 /* Swap the operands, if needed, to make the reduction operand be the second 2890 operand. */ 2891 lhs = PHI_RESULT (phi); 2892 for (unsigned i = 0; i < reduc_chain.length (); ++i) 2893 { 2894 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt); 2895 if (gimple_assign_rhs2 (next_stmt) == lhs) 2896 { 2897 tree op = gimple_assign_rhs1 (next_stmt); 2898 gimple *def_stmt = NULL; 2899 2900 if (TREE_CODE (op) == SSA_NAME) 2901 def_stmt = SSA_NAME_DEF_STMT (op); 2902 2903 /* Check that the other def is either defined in the loop 2904 ("vect_internal_def"), or it's an induction (defined by a 2905 loop-header phi-node). 
*/ 2906 if (def_stmt 2907 && gimple_bb (def_stmt) 2908 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2909 && (is_gimple_assign (def_stmt) 2910 || is_gimple_call (def_stmt) 2911 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2912 == vect_induction_def 2913 || (gimple_code (def_stmt) == GIMPLE_PHI 2914 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2915 == vect_internal_def 2916 && !is_loop_header_bb_p (gimple_bb (def_stmt))))) 2917 { 2918 lhs = gimple_assign_lhs (next_stmt); 2919 continue; 2920 } 2921 2922 return false; 2923 } 2924 else 2925 { 2926 tree op = gimple_assign_rhs2 (next_stmt); 2927 gimple *def_stmt = NULL; 2928 2929 if (TREE_CODE (op) == SSA_NAME) 2930 def_stmt = SSA_NAME_DEF_STMT (op); 2931 2932 /* Check that the other def is either defined in the loop 2933 ("vect_internal_def"), or it's an induction (defined by a 2934 loop-header phi-node). */ 2935 if (def_stmt 2936 && gimple_bb (def_stmt) 2937 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2938 && (is_gimple_assign (def_stmt) 2939 || is_gimple_call (def_stmt) 2940 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2941 == vect_induction_def 2942 || (gimple_code (def_stmt) == GIMPLE_PHI 2943 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2944 == vect_internal_def 2945 && !is_loop_header_bb_p (gimple_bb (def_stmt))))) 2946 { 2947 if (dump_enabled_p ()) 2948 { 2949 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: "); 2950 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0); 2951 } 2952 2953 swap_ssa_operands (next_stmt, 2954 gimple_assign_rhs1_ptr (next_stmt), 2955 gimple_assign_rhs2_ptr (next_stmt)); 2956 update_stmt (next_stmt); 2957 2958 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) 2959 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 2960 } 2961 else 2962 return false; 2963 } 2964 2965 lhs = gimple_assign_lhs (next_stmt); 2966 } 2967 2968 /* Build up the actual chain. */ 2969 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) 2970 { 2971 GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt; 2972 GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt; 2973 } 2974 GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt; 2975 GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; 2976 2977 /* Save the chain for further analysis in SLP detection. */ 2978 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt); 2979 GROUP_SIZE (reduc_chain[0]) = size; 2980 2981 return true; 2982 } 2983 2984 /* Return true if we need an in-order reduction for operation CODE 2985 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer 2986 overflow must wrap. */ 2987 2988 static bool 2989 needs_fold_left_reduction_p (tree type, tree_code code, 2990 bool need_wrapping_integral_overflow) 2991 { 2992 /* CHECKME: check for !flag_finite_math_only too? */ 2993 if (SCALAR_FLOAT_TYPE_P (type)) 2994 switch (code) 2995 { 2996 case MIN_EXPR: 2997 case MAX_EXPR: 2998 return false; 2999 3000 default: 3001 return !flag_associative_math; 3002 } 3003 3004 if (INTEGRAL_TYPE_P (type)) 3005 { 3006 if (!operation_no_trapping_overflow (type, code)) 3007 return true; 3008 if (need_wrapping_integral_overflow 3009 && !TYPE_OVERFLOW_WRAPS (type) 3010 && operation_can_overflow (code)) 3011 return true; 3012 return false; 3013 } 3014 3015 if (SAT_FIXED_POINT_TYPE_P (type)) 3016 return true; 3017 3018 return false; 3019 } 3020 3021 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and 3022 reduction operation CODE has a handled computation expression. 
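   For example (simplified, illustrative GIMPLE):

     a1 = PHI <a0 (preheader), a4 (latch)>
     a2 = a1 + x[i];
     a3 = a2 + y[i];
     a4 = a3 + z[i];

   the path from the PHI result a1 to the latch argument a4 uses PLUS_EXPR
   throughout and every intermediate value has a single use, so the path is
   accepted.  A MINUS_EXPR step is tolerated for a PLUS_EXPR reduction as
   long as the reduction value is not negated overall.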
*/ 3023 3024 bool 3025 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg, 3026 enum tree_code code) 3027 { 3028 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; 3029 auto_bitmap visited; 3030 tree lookfor = PHI_RESULT (phi); 3031 ssa_op_iter curri; 3032 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); 3033 while (USE_FROM_PTR (curr) != loop_arg) 3034 curr = op_iter_next_use (&curri); 3035 curri.i = curri.numops; 3036 do 3037 { 3038 path.safe_push (std::make_pair (curri, curr)); 3039 tree use = USE_FROM_PTR (curr); 3040 if (use == lookfor) 3041 break; 3042 gimple *def = SSA_NAME_DEF_STMT (use); 3043 if (gimple_nop_p (def) 3044 || ! flow_bb_inside_loop_p (loop, gimple_bb (def))) 3045 { 3046 pop: 3047 do 3048 { 3049 std::pair<ssa_op_iter, use_operand_p> x = path.pop (); 3050 curri = x.first; 3051 curr = x.second; 3052 do 3053 curr = op_iter_next_use (&curri); 3054 /* Skip already visited or non-SSA operands (from iterating 3055 over PHI args). */ 3056 while (curr != NULL_USE_OPERAND_P 3057 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 3058 || ! bitmap_set_bit (visited, 3059 SSA_NAME_VERSION 3060 (USE_FROM_PTR (curr))))); 3061 } 3062 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ()); 3063 if (curr == NULL_USE_OPERAND_P) 3064 break; 3065 } 3066 else 3067 { 3068 if (gimple_code (def) == GIMPLE_PHI) 3069 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE); 3070 else 3071 curr = op_iter_init_use (&curri, def, SSA_OP_USE); 3072 while (curr != NULL_USE_OPERAND_P 3073 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 3074 || ! bitmap_set_bit (visited, 3075 SSA_NAME_VERSION 3076 (USE_FROM_PTR (curr))))) 3077 curr = op_iter_next_use (&curri); 3078 if (curr == NULL_USE_OPERAND_P) 3079 goto pop; 3080 } 3081 } 3082 while (1); 3083 if (dump_file && (dump_flags & TDF_DETAILS)) 3084 { 3085 dump_printf_loc (MSG_NOTE, loc, "reduction path: "); 3086 unsigned i; 3087 std::pair<ssa_op_iter, use_operand_p> *x; 3088 FOR_EACH_VEC_ELT (path, i, x) 3089 { 3090 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second)); 3091 dump_printf (MSG_NOTE, " "); 3092 } 3093 dump_printf (MSG_NOTE, "\n"); 3094 } 3095 3096 /* Check whether the reduction path detected is valid. */ 3097 bool fail = path.length () == 0; 3098 bool neg = false; 3099 for (unsigned i = 1; i < path.length (); ++i) 3100 { 3101 gimple *use_stmt = USE_STMT (path[i].second); 3102 tree op = USE_FROM_PTR (path[i].second); 3103 if (! has_single_use (op) 3104 || ! is_gimple_assign (use_stmt)) 3105 { 3106 fail = true; 3107 break; 3108 } 3109 if (gimple_assign_rhs_code (use_stmt) != code) 3110 { 3111 if (code == PLUS_EXPR 3112 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) 3113 { 3114 /* Track whether we negate the reduction value each iteration. */ 3115 if (gimple_assign_rhs2 (use_stmt) == op) 3116 neg = ! neg; 3117 } 3118 else 3119 { 3120 fail = true; 3121 break; 3122 } 3123 } 3124 } 3125 return ! fail && ! neg; 3126 } 3127 3128 3129 /* Function vect_is_simple_reduction 3130 3131 (1) Detect a cross-iteration def-use cycle that represents a simple 3132 reduction computation. We look for the following pattern: 3133 3134 loop_header: 3135 a1 = phi < a0, a2 > 3136 a3 = ... 3137 a2 = operation (a3, a1) 3138 3139 or 3140 3141 a3 = ... 3142 loop_header: 3143 a1 = phi < a0, a2 > 3144 a2 = operation (a3, a1) 3145 3146 such that: 3147 1. operation is commutative and associative and it is safe to 3148 change the order of the computation 3149 2. 
no uses for a2 in the loop (a2 is used out of the loop) 3150 3. no uses of a1 in the loop besides the reduction operation 3151 4. no uses of a1 outside the loop. 3152 3153 Conditions 1,4 are tested here. 3154 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. 3155 3156 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 3157 nested cycles. 3158 3159 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double 3160 reductions: 3161 3162 a1 = phi < a0, a2 > 3163 inner loop (def of a3) 3164 a2 = phi < a3 > 3165 3166 (4) Detect condition expressions, ie: 3167 for (int i = 0; i < N; i++) 3168 if (a[i] < val) 3169 ret_val = a[i]; 3170 3171 */ 3172 3173 static gimple * 3174 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, 3175 bool *double_reduc, 3176 bool need_wrapping_integral_overflow, 3177 enum vect_reduction_type *v_reduc_type) 3178 { 3179 struct loop *loop = (gimple_bb (phi))->loop_father; 3180 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); 3181 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL; 3182 enum tree_code orig_code, code; 3183 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; 3184 tree type; 3185 int nloop_uses; 3186 tree name; 3187 imm_use_iterator imm_iter; 3188 use_operand_p use_p; 3189 bool phi_def; 3190 3191 *double_reduc = false; 3192 *v_reduc_type = TREE_CODE_REDUCTION; 3193 3194 tree phi_name = PHI_RESULT (phi); 3195 /* ??? If there are no uses of the PHI result the inner loop reduction 3196 won't be detected as possibly double-reduction by vectorizable_reduction 3197 because that tries to walk the PHI arg from the preheader edge which 3198 can be constant. See PR60382. */ 3199 if (has_zero_uses (phi_name)) 3200 return NULL; 3201 nloop_uses = 0; 3202 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) 3203 { 3204 gimple *use_stmt = USE_STMT (use_p); 3205 if (is_gimple_debug (use_stmt)) 3206 continue; 3207 3208 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3209 { 3210 if (dump_enabled_p ()) 3211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3212 "intermediate value used outside loop.\n"); 3213 3214 return NULL; 3215 } 3216 3217 nloop_uses++; 3218 if (nloop_uses > 1) 3219 { 3220 if (dump_enabled_p ()) 3221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3222 "reduction value used in loop.\n"); 3223 return NULL; 3224 } 3225 3226 phi_use_stmt = use_stmt; 3227 } 3228 3229 edge latch_e = loop_latch_edge (loop); 3230 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 3231 if (TREE_CODE (loop_arg) != SSA_NAME) 3232 { 3233 if (dump_enabled_p ()) 3234 { 3235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3236 "reduction: not ssa_name: "); 3237 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg); 3238 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3239 } 3240 return NULL; 3241 } 3242 3243 def_stmt = SSA_NAME_DEF_STMT (loop_arg); 3244 if (is_gimple_assign (def_stmt)) 3245 { 3246 name = gimple_assign_lhs (def_stmt); 3247 phi_def = false; 3248 } 3249 else if (gimple_code (def_stmt) == GIMPLE_PHI) 3250 { 3251 name = PHI_RESULT (def_stmt); 3252 phi_def = true; 3253 } 3254 else 3255 { 3256 if (dump_enabled_p ()) 3257 { 3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3259 "reduction: unhandled reduction operation: "); 3260 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0); 3261 } 3262 return NULL; 3263 } 3264 3265 if (! 
flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))) 3266 return NULL; 3267 3268 nloop_uses = 0; 3269 auto_vec<gphi *, 3> lcphis; 3270 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 3271 { 3272 gimple *use_stmt = USE_STMT (use_p); 3273 if (is_gimple_debug (use_stmt)) 3274 continue; 3275 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3276 nloop_uses++; 3277 else 3278 /* We can have more than one loop-closed PHI. */ 3279 lcphis.safe_push (as_a <gphi *> (use_stmt)); 3280 if (nloop_uses > 1) 3281 { 3282 if (dump_enabled_p ()) 3283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3284 "reduction used in loop.\n"); 3285 return NULL; 3286 } 3287 } 3288 3289 /* If DEF_STMT is a phi node itself, we expect it to have a single argument 3290 defined in the inner loop. */ 3291 if (phi_def) 3292 { 3293 op1 = PHI_ARG_DEF (def_stmt, 0); 3294 3295 if (gimple_phi_num_args (def_stmt) != 1 3296 || TREE_CODE (op1) != SSA_NAME) 3297 { 3298 if (dump_enabled_p ()) 3299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3300 "unsupported phi node definition.\n"); 3301 3302 return NULL; 3303 } 3304 3305 def1 = SSA_NAME_DEF_STMT (op1); 3306 if (gimple_bb (def1) 3307 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 3308 && loop->inner 3309 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) 3310 && is_gimple_assign (def1) 3311 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) 3312 { 3313 if (dump_enabled_p ()) 3314 report_vect_op (MSG_NOTE, def_stmt, 3315 "detected double reduction: "); 3316 3317 *double_reduc = true; 3318 return def_stmt; 3319 } 3320 3321 return NULL; 3322 } 3323 3324 /* If we are vectorizing an inner reduction we are executing that 3325 in the original order only in case we are not dealing with a 3326 double reduction. */ 3327 bool check_reduction = true; 3328 if (flow_loop_nested_p (vect_loop, loop)) 3329 { 3330 gphi *lcphi; 3331 unsigned i; 3332 check_reduction = false; 3333 FOR_EACH_VEC_ELT (lcphis, i, lcphi) 3334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) 3335 { 3336 gimple *use_stmt = USE_STMT (use_p); 3337 if (is_gimple_debug (use_stmt)) 3338 continue; 3339 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) 3340 check_reduction = true; 3341 } 3342 } 3343 3344 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); 3345 code = orig_code = gimple_assign_rhs_code (def_stmt); 3346 3347 /* We can handle "res -= x[i]", which is non-associative by 3348 simply rewriting this into "res += -x[i]". Avoid changing 3349 gimple instruction for the first simple tests and only do this 3350 if we're allowed to change code at all. */ 3351 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) 3352 code = PLUS_EXPR; 3353 3354 if (code == COND_EXPR) 3355 { 3356 if (! 
nested_in_vect_loop) 3357 *v_reduc_type = COND_REDUCTION; 3358 3359 op3 = gimple_assign_rhs1 (def_stmt); 3360 if (COMPARISON_CLASS_P (op3)) 3361 { 3362 op4 = TREE_OPERAND (op3, 1); 3363 op3 = TREE_OPERAND (op3, 0); 3364 } 3365 if (op3 == phi_name || op4 == phi_name) 3366 { 3367 if (dump_enabled_p ()) 3368 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3369 "reduction: condition depends on previous" 3370 " iteration: "); 3371 return NULL; 3372 } 3373 3374 op1 = gimple_assign_rhs2 (def_stmt); 3375 op2 = gimple_assign_rhs3 (def_stmt); 3376 } 3377 else if (!commutative_tree_code (code) || !associative_tree_code (code)) 3378 { 3379 if (dump_enabled_p ()) 3380 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3381 "reduction: not commutative/associative: "); 3382 return NULL; 3383 } 3384 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) 3385 { 3386 op1 = gimple_assign_rhs1 (def_stmt); 3387 op2 = gimple_assign_rhs2 (def_stmt); 3388 } 3389 else 3390 { 3391 if (dump_enabled_p ()) 3392 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3393 "reduction: not handled operation: "); 3394 return NULL; 3395 } 3396 3397 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) 3398 { 3399 if (dump_enabled_p ()) 3400 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3401 "reduction: both uses not ssa_names: "); 3402 3403 return NULL; 3404 } 3405 3406 type = TREE_TYPE (gimple_assign_lhs (def_stmt)); 3407 if ((TREE_CODE (op1) == SSA_NAME 3408 && !types_compatible_p (type,TREE_TYPE (op1))) 3409 || (TREE_CODE (op2) == SSA_NAME 3410 && !types_compatible_p (type, TREE_TYPE (op2))) 3411 || (op3 && TREE_CODE (op3) == SSA_NAME 3412 && !types_compatible_p (type, TREE_TYPE (op3))) 3413 || (op4 && TREE_CODE (op4) == SSA_NAME 3414 && !types_compatible_p (type, TREE_TYPE (op4)))) 3415 { 3416 if (dump_enabled_p ()) 3417 { 3418 dump_printf_loc (MSG_NOTE, vect_location, 3419 "reduction: multiple types: operation type: "); 3420 dump_generic_expr (MSG_NOTE, TDF_SLIM, type); 3421 dump_printf (MSG_NOTE, ", operands types: "); 3422 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3423 TREE_TYPE (op1)); 3424 dump_printf (MSG_NOTE, ","); 3425 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3426 TREE_TYPE (op2)); 3427 if (op3) 3428 { 3429 dump_printf (MSG_NOTE, ","); 3430 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3431 TREE_TYPE (op3)); 3432 } 3433 3434 if (op4) 3435 { 3436 dump_printf (MSG_NOTE, ","); 3437 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3438 TREE_TYPE (op4)); 3439 } 3440 dump_printf (MSG_NOTE, "\n"); 3441 } 3442 3443 return NULL; 3444 } 3445 3446 /* Check whether it's ok to change the order of the computation. 3447 Generally, when vectorizing a reduction we change the order of the 3448 computation. This may change the behavior of the program in some 3449 cases, so we need to check that this is ok. One exception is when 3450 vectorizing an outer-loop: the inner-loop is executed sequentially, 3451 and therefore vectorizing reductions in the inner-loop during 3452 outer-loop vectorization is safe. */ 3453 if (check_reduction 3454 && *v_reduc_type == TREE_CODE_REDUCTION 3455 && needs_fold_left_reduction_p (type, code, 3456 need_wrapping_integral_overflow)) 3457 *v_reduc_type = FOLD_LEFT_REDUCTION; 3458 3459 /* Reduction is safe. We're dealing with one of the following: 3460 1) integer arithmetic and no trapv 3461 2) floating point arithmetic, and special flags permit this optimization 3462 3) nested cycle (i.e., outer loop vectorization). 
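   For instance (illustrative), a floating-point accumulation such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   falls under case 2 only when reassociation is permitted (e.g. with
   -fassociative-math); otherwise the check above has already selected an
   in-order FOLD_LEFT_REDUCTION for it.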
*/ 3463 if (TREE_CODE (op1) == SSA_NAME) 3464 def1 = SSA_NAME_DEF_STMT (op1); 3465 3466 if (TREE_CODE (op2) == SSA_NAME) 3467 def2 = SSA_NAME_DEF_STMT (op2); 3468 3469 if (code != COND_EXPR 3470 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2)))) 3471 { 3472 if (dump_enabled_p ()) 3473 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: "); 3474 return NULL; 3475 } 3476 3477 /* Check that one def is the reduction def, defined by PHI, 3478 the other def is either defined in the loop ("vect_internal_def"), 3479 or it's an induction (defined by a loop-header phi-node). */ 3480 3481 if (def2 && def2 == phi 3482 && (code == COND_EXPR 3483 || !def1 || gimple_nop_p (def1) 3484 || !flow_bb_inside_loop_p (loop, gimple_bb (def1)) 3485 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) 3486 && (is_gimple_assign (def1) 3487 || is_gimple_call (def1) 3488 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 3489 == vect_induction_def 3490 || (gimple_code (def1) == GIMPLE_PHI 3491 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 3492 == vect_internal_def 3493 && !is_loop_header_bb_p (gimple_bb (def1))))))) 3494 { 3495 if (dump_enabled_p ()) 3496 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3497 return def_stmt; 3498 } 3499 3500 if (def1 && def1 == phi 3501 && (code == COND_EXPR 3502 || !def2 || gimple_nop_p (def2) 3503 || !flow_bb_inside_loop_p (loop, gimple_bb (def2)) 3504 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) 3505 && (is_gimple_assign (def2) 3506 || is_gimple_call (def2) 3507 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 3508 == vect_induction_def 3509 || (gimple_code (def2) == GIMPLE_PHI 3510 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 3511 == vect_internal_def 3512 && !is_loop_header_bb_p (gimple_bb (def2))))))) 3513 { 3514 if (! nested_in_vect_loop && orig_code != MINUS_EXPR) 3515 { 3516 /* Check if we can swap operands (just for simplicity - so that 3517 the rest of the code can assume that the reduction variable 3518 is always the last (second) argument). */ 3519 if (code == COND_EXPR) 3520 { 3521 /* Swap cond_expr by inverting the condition. */ 3522 tree cond_expr = gimple_assign_rhs1 (def_stmt); 3523 enum tree_code invert_code = ERROR_MARK; 3524 enum tree_code cond_code = TREE_CODE (cond_expr); 3525 3526 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) 3527 { 3528 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); 3529 invert_code = invert_tree_comparison (cond_code, honor_nans); 3530 } 3531 if (invert_code != ERROR_MARK) 3532 { 3533 TREE_SET_CODE (cond_expr, invert_code); 3534 swap_ssa_operands (def_stmt, 3535 gimple_assign_rhs2_ptr (def_stmt), 3536 gimple_assign_rhs3_ptr (def_stmt)); 3537 } 3538 else 3539 { 3540 if (dump_enabled_p ()) 3541 report_vect_op (MSG_NOTE, def_stmt, 3542 "detected reduction: cannot swap operands " 3543 "for cond_expr"); 3544 return NULL; 3545 } 3546 } 3547 else 3548 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), 3549 gimple_assign_rhs2_ptr (def_stmt)); 3550 3551 if (dump_enabled_p ()) 3552 report_vect_op (MSG_NOTE, def_stmt, 3553 "detected reduction: need to swap operands: "); 3554 3555 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) 3556 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 3557 } 3558 else 3559 { 3560 if (dump_enabled_p ()) 3561 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3562 } 3563 3564 return def_stmt; 3565 } 3566 3567 /* Try to find SLP reduction chain. */ 3568 if (! 
nested_in_vect_loop 3569 && code != COND_EXPR 3570 && orig_code != MINUS_EXPR 3571 && vect_is_slp_reduction (loop_info, phi, def_stmt)) 3572 { 3573 if (dump_enabled_p ()) 3574 report_vect_op (MSG_NOTE, def_stmt, 3575 "reduction: detected reduction chain: "); 3576 3577 return def_stmt; 3578 } 3579 3580 /* Dissolve group eventually half-built by vect_is_slp_reduction. */ 3581 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt)); 3582 while (first) 3583 { 3584 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)); 3585 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL; 3586 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL; 3587 first = next; 3588 } 3589 3590 /* Look for the expression computing loop_arg from loop PHI result. */ 3591 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg, 3592 code)) 3593 return def_stmt; 3594 3595 if (dump_enabled_p ()) 3596 { 3597 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3598 "reduction: unknown pattern: "); 3599 } 3600 3601 return NULL; 3602 } 3603 3604 /* Wrapper around vect_is_simple_reduction, which will modify code 3605 in-place if it enables detection of more reductions. Arguments 3606 as there. */ 3607 3608 gimple * 3609 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi, 3610 bool *double_reduc, 3611 bool need_wrapping_integral_overflow) 3612 { 3613 enum vect_reduction_type v_reduc_type; 3614 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc, 3615 need_wrapping_integral_overflow, 3616 &v_reduc_type); 3617 if (def) 3618 { 3619 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi); 3620 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type; 3621 STMT_VINFO_REDUC_DEF (reduc_def_info) = def; 3622 reduc_def_info = vinfo_for_stmt (def); 3623 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type; 3624 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi; 3625 } 3626 return def; 3627 } 3628 3629 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ 3630 int 3631 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, 3632 int *peel_iters_epilogue, 3633 stmt_vector_for_cost *scalar_cost_vec, 3634 stmt_vector_for_cost *prologue_cost_vec, 3635 stmt_vector_for_cost *epilogue_cost_vec) 3636 { 3637 int retval = 0; 3638 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3639 3640 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 3641 { 3642 *peel_iters_epilogue = assumed_vf / 2; 3643 if (dump_enabled_p ()) 3644 dump_printf_loc (MSG_NOTE, vect_location, 3645 "cost model: epilogue peel iters set to vf/2 " 3646 "because loop iterations are unknown .\n"); 3647 3648 /* If peeled iterations are known but number of scalar loop 3649 iterations are unknown, count a taken branch per peeled loop. */ 3650 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3651 NULL, 0, vect_prologue); 3652 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3653 NULL, 0, vect_epilogue); 3654 } 3655 else 3656 { 3657 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); 3658 peel_iters_prologue = niters < peel_iters_prologue ? 3659 niters : peel_iters_prologue; 3660 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; 3661 /* If we need to peel for gaps, but no peeling is required, we have to 3662 peel VF iterations. 
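   Worked example (illustrative numbers): with NITERS = 16, VF = 8 and no
   prologue peeling, (16 - 0) % 8 gives an epilogue of 0 iterations; but
   because peeling for gaps means the last vector iteration cannot be
   executed in full, the epilogue is forced to a whole VF = 8 iterations
   instead.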
*/ 3663 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue) 3664 *peel_iters_epilogue = assumed_vf; 3665 } 3666 3667 stmt_info_for_cost *si; 3668 int j; 3669 if (peel_iters_prologue) 3670 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3671 { 3672 stmt_vec_info stmt_info 3673 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3674 retval += record_stmt_cost (prologue_cost_vec, 3675 si->count * peel_iters_prologue, 3676 si->kind, stmt_info, si->misalign, 3677 vect_prologue); 3678 } 3679 if (*peel_iters_epilogue) 3680 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3681 { 3682 stmt_vec_info stmt_info 3683 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3684 retval += record_stmt_cost (epilogue_cost_vec, 3685 si->count * *peel_iters_epilogue, 3686 si->kind, stmt_info, si->misalign, 3687 vect_epilogue); 3688 } 3689 3690 return retval; 3691 } 3692 3693 /* Function vect_estimate_min_profitable_iters 3694 3695 Return the number of iterations required for the vector version of the 3696 loop to be profitable relative to the cost of the scalar version of the 3697 loop. 3698 3699 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold 3700 of iterations for vectorization. -1 value means loop vectorization 3701 is not profitable. This returned value may be used for dynamic 3702 profitability check. 3703 3704 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used 3705 for static check against estimated number of iterations. */ 3706 3707 static void 3708 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, 3709 int *ret_min_profitable_niters, 3710 int *ret_min_profitable_estimate) 3711 { 3712 int min_profitable_iters; 3713 int min_profitable_estimate; 3714 int peel_iters_prologue; 3715 int peel_iters_epilogue; 3716 unsigned vec_inside_cost = 0; 3717 int vec_outside_cost = 0; 3718 unsigned vec_prologue_cost = 0; 3719 unsigned vec_epilogue_cost = 0; 3720 int scalar_single_iter_cost = 0; 3721 int scalar_outside_cost = 0; 3722 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3723 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 3724 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3725 3726 /* Cost model disabled. */ 3727 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 3728 { 3729 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); 3730 *ret_min_profitable_niters = 0; 3731 *ret_min_profitable_estimate = 0; 3732 return; 3733 } 3734 3735 /* Requires loop versioning tests to handle misalignment. */ 3736 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 3737 { 3738 /* FIXME: Make cost depend on complexity of individual check. */ 3739 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); 3740 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3741 vect_prologue); 3742 dump_printf (MSG_NOTE, 3743 "cost model: Adding cost of checks for loop " 3744 "versioning to treat misalignment.\n"); 3745 } 3746 3747 /* Requires loop versioning with alias checks. */ 3748 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 3749 { 3750 /* FIXME: Make cost depend on complexity of individual check. */ 3751 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); 3752 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3753 vect_prologue); 3754 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length (); 3755 if (len) 3756 /* Count LEN - 1 ANDs and LEN comparisons. 
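	   For example (hypothetical count), LEN = 3 address pairs are modelled
	   as 3 comparisons combined by 2 ANDs, i.e. 3 * 2 - 1 = 5 scalar
	   statements.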
*/ 3757 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt, 3758 NULL, 0, vect_prologue); 3759 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); 3760 if (len) 3761 { 3762 /* Count LEN - 1 ANDs and LEN comparisons. */ 3763 unsigned int nstmts = len * 2 - 1; 3764 /* +1 for each bias that needs adding. */ 3765 for (unsigned int i = 0; i < len; ++i) 3766 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) 3767 nstmts += 1; 3768 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, 3769 NULL, 0, vect_prologue); 3770 } 3771 dump_printf (MSG_NOTE, 3772 "cost model: Adding cost of checks for loop " 3773 "versioning aliasing.\n"); 3774 } 3775 3776 /* Requires loop versioning with niter checks. */ 3777 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) 3778 { 3779 /* FIXME: Make cost depend on complexity of individual check. */ 3780 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, 3781 vect_prologue); 3782 dump_printf (MSG_NOTE, 3783 "cost model: Adding cost of checks for loop " 3784 "versioning niters.\n"); 3785 } 3786 3787 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3788 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, 3789 vect_prologue); 3790 3791 /* Count statements in scalar loop. Using this as scalar cost for a single 3792 iteration for now. 3793 3794 TODO: Add outer loop support. 3795 3796 TODO: Consider assigning different costs to different scalar 3797 statements. */ 3798 3799 scalar_single_iter_cost 3800 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo); 3801 3802 /* Add additional cost for the peeled instructions in prologue and epilogue 3803 loop. (For fully-masked loops there will be no peeling.) 3804 3805 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 3806 at compile-time - we assume it's vf/2 (the worst would be vf-1). 3807 3808 TODO: Build an expression that represents peel_iters for prologue and 3809 epilogue to be used in a run-time test. */ 3810 3811 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3812 { 3813 peel_iters_prologue = 0; 3814 peel_iters_epilogue = 0; 3815 3816 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 3817 { 3818 /* We need to peel exactly one iteration. */ 3819 peel_iters_epilogue += 1; 3820 stmt_info_for_cost *si; 3821 int j; 3822 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 3823 j, si) 3824 { 3825 struct _stmt_vec_info *stmt_info 3826 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3827 (void) add_stmt_cost (target_cost_data, si->count, 3828 si->kind, stmt_info, si->misalign, 3829 vect_epilogue); 3830 } 3831 } 3832 } 3833 else if (npeel < 0) 3834 { 3835 peel_iters_prologue = assumed_vf / 2; 3836 dump_printf (MSG_NOTE, "cost model: " 3837 "prologue peel iters set to vf/2.\n"); 3838 3839 /* If peeling for alignment is unknown, loop bound of main loop becomes 3840 unknown. */ 3841 peel_iters_epilogue = assumed_vf / 2; 3842 dump_printf (MSG_NOTE, "cost model: " 3843 "epilogue peel iters set to vf/2 because " 3844 "peeling for alignment is unknown.\n"); 3845 3846 /* If peeled iterations are unknown, count a taken branch and a not taken 3847 branch per peeled loop. Even if scalar loop iterations are known, 3848 vector iterations are not known since peeled prologue iterations are 3849 not known. Hence guards remain the same. 
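	 For illustration with an assumed VF of 8: the prologue and the epilogue
	 are each costed as 4 peeled scalar iterations, and each side is also
	 charged one taken and one not-taken branch below.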
*/ 3850 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3851 NULL, 0, vect_prologue); 3852 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3853 NULL, 0, vect_prologue); 3854 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3855 NULL, 0, vect_epilogue); 3856 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3857 NULL, 0, vect_epilogue); 3858 stmt_info_for_cost *si; 3859 int j; 3860 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) 3861 { 3862 struct _stmt_vec_info *stmt_info 3863 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3864 (void) add_stmt_cost (target_cost_data, 3865 si->count * peel_iters_prologue, 3866 si->kind, stmt_info, si->misalign, 3867 vect_prologue); 3868 (void) add_stmt_cost (target_cost_data, 3869 si->count * peel_iters_epilogue, 3870 si->kind, stmt_info, si->misalign, 3871 vect_epilogue); 3872 } 3873 } 3874 else 3875 { 3876 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 3877 stmt_info_for_cost *si; 3878 int j; 3879 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3880 3881 prologue_cost_vec.create (2); 3882 epilogue_cost_vec.create (2); 3883 peel_iters_prologue = npeel; 3884 3885 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue, 3886 &peel_iters_epilogue, 3887 &LOOP_VINFO_SCALAR_ITERATION_COST 3888 (loop_vinfo), 3889 &prologue_cost_vec, 3890 &epilogue_cost_vec); 3891 3892 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si) 3893 { 3894 struct _stmt_vec_info *stmt_info 3895 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3896 (void) add_stmt_cost (data, si->count, si->kind, stmt_info, 3897 si->misalign, vect_prologue); 3898 } 3899 3900 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si) 3901 { 3902 struct _stmt_vec_info *stmt_info 3903 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3904 (void) add_stmt_cost (data, si->count, si->kind, stmt_info, 3905 si->misalign, vect_epilogue); 3906 } 3907 3908 prologue_cost_vec.release (); 3909 epilogue_cost_vec.release (); 3910 } 3911 3912 /* FORNOW: The scalar outside cost is incremented in one of the 3913 following ways: 3914 3915 1. The vectorizer checks for alignment and aliasing and generates 3916 a condition that allows dynamic vectorization. A cost model 3917 check is ANDED with the versioning condition. Hence scalar code 3918 path now has the added cost of the versioning check. 3919 3920 if (cost > th & versioning_check) 3921 jmp to vector code 3922 3923 Hence run-time scalar is incremented by not-taken branch cost. 3924 3925 2. The vectorizer then checks if a prologue is required. If the 3926 cost model check was not done before during versioning, it has to 3927 be done before the prologue check. 3928 3929 if (cost <= th) 3930 prologue = scalar_iters 3931 if (prologue == 0) 3932 jmp to vector code 3933 else 3934 execute prologue 3935 if (prologue == num_iters) 3936 go to exit 3937 3938 Hence the run-time scalar cost is incremented by a taken branch, 3939 plus a not-taken branch, plus a taken branch cost. 3940 3941 3. The vectorizer then checks if an epilogue is required. If the 3942 cost model check was not done before during prologue check, it 3943 has to be done with the epilogue check. 3944 3945 if (prologue == 0) 3946 jmp to vector code 3947 else 3948 execute prologue 3949 if (prologue == num_iters) 3950 go to exit 3951 vector code: 3952 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) 3953 jmp to epilogue 3954 3955 Hence the run-time scalar cost should be incremented by 2 taken 3956 branches. 
3957 3958 TODO: The back end may reorder the BBS's differently and reverse 3959 conditions/branch directions. Change the estimates below to 3960 something more reasonable. */ 3961 3962 /* If the number of iterations is known and we do not do versioning, we can 3963 decide whether to vectorize at compile time. Hence the scalar version 3964 do not carry cost model guard costs. */ 3965 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 3966 || LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3967 { 3968 /* Cost model check occurs at versioning. */ 3969 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3970 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken); 3971 else 3972 { 3973 /* Cost model check occurs at prologue generation. */ 3974 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 3975 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken) 3976 + vect_get_stmt_cost (cond_branch_not_taken); 3977 /* Cost model check occurs at epilogue generation. */ 3978 else 3979 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken); 3980 } 3981 } 3982 3983 /* Complete the target-specific cost calculations. */ 3984 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, 3985 &vec_inside_cost, &vec_epilogue_cost); 3986 3987 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); 3988 3989 if (dump_enabled_p ()) 3990 { 3991 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); 3992 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", 3993 vec_inside_cost); 3994 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", 3995 vec_prologue_cost); 3996 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", 3997 vec_epilogue_cost); 3998 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n", 3999 scalar_single_iter_cost); 4000 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n", 4001 scalar_outside_cost); 4002 dump_printf (MSG_NOTE, " Vector outside cost: %d\n", 4003 vec_outside_cost); 4004 dump_printf (MSG_NOTE, " prologue iterations: %d\n", 4005 peel_iters_prologue); 4006 dump_printf (MSG_NOTE, " epilogue iterations: %d\n", 4007 peel_iters_epilogue); 4008 } 4009 4010 /* Calculate number of iterations required to make the vector version 4011 profitable, relative to the loop bodies only. The following condition 4012 must hold true: 4013 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC 4014 where 4015 SIC = scalar iteration cost, VIC = vector iteration cost, 4016 VOC = vector outside cost, VF = vectorization factor, 4017 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations 4018 SOC = scalar outside cost for run time cost model check. */ 4019 4020 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost) 4021 { 4022 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) 4023 * assumed_vf 4024 - vec_inside_cost * peel_iters_prologue 4025 - vec_inside_cost * peel_iters_epilogue); 4026 if (min_profitable_iters <= 0) 4027 min_profitable_iters = 0; 4028 else 4029 { 4030 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf) 4031 - vec_inside_cost); 4032 4033 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) 4034 <= (((int) vec_inside_cost * min_profitable_iters) 4035 + (((int) vec_outside_cost - scalar_outside_cost) 4036 * assumed_vf))) 4037 min_profitable_iters++; 4038 } 4039 } 4040 /* vector version will never be profitable. 
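     This is the case when SIC * VF <= VIC, i.e. one vector iteration costs
     at least as much as the VF scalar iterations it replaces.  For
     illustration (hypothetical costs): SIC = 4, VF = 4 and VIC = 20 give
     16 <= 20, so no iteration count can recover the difference and -1 is
     returned.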
*/ 4041 else 4042 { 4043 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) 4044 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization " 4045 "did not happen for a simd loop"); 4046 4047 if (dump_enabled_p ()) 4048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4049 "cost model: the vector iteration cost = %d " 4050 "divided by the scalar iteration cost = %d " 4051 "is greater or equal to the vectorization factor = %d" 4052 ".\n", 4053 vec_inside_cost, scalar_single_iter_cost, assumed_vf); 4054 *ret_min_profitable_niters = -1; 4055 *ret_min_profitable_estimate = -1; 4056 return; 4057 } 4058 4059 dump_printf (MSG_NOTE, 4060 " Calculated minimum iters for profitability: %d\n", 4061 min_profitable_iters); 4062 4063 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 4064 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) 4065 /* We want the vectorized loop to execute at least once. */ 4066 min_profitable_iters = assumed_vf + peel_iters_prologue; 4067 4068 if (dump_enabled_p ()) 4069 dump_printf_loc (MSG_NOTE, vect_location, 4070 " Runtime profitability threshold = %d\n", 4071 min_profitable_iters); 4072 4073 *ret_min_profitable_niters = min_profitable_iters; 4074 4075 /* Calculate number of iterations required to make the vector version 4076 profitable, relative to the loop bodies only. 4077 4078 Non-vectorized variant is SIC * niters and it must win over vector 4079 variant on the expected loop trip count. The following condition must hold true: 4080 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */ 4081 4082 if (vec_outside_cost <= 0) 4083 min_profitable_estimate = 0; 4084 else 4085 { 4086 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) 4087 * assumed_vf 4088 - vec_inside_cost * peel_iters_prologue 4089 - vec_inside_cost * peel_iters_epilogue) 4090 / ((scalar_single_iter_cost * assumed_vf) 4091 - vec_inside_cost); 4092 } 4093 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); 4094 if (dump_enabled_p ()) 4095 dump_printf_loc (MSG_NOTE, vect_location, 4096 " Static estimate profitability threshold = %d\n", 4097 min_profitable_estimate); 4098 4099 *ret_min_profitable_estimate = min_profitable_estimate; 4100 } 4101 4102 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET 4103 vector elements (not bits) for a vector with NELT elements. */ 4104 static void 4105 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, 4106 vec_perm_builder *sel) 4107 { 4108 /* The encoding is a single stepped pattern. Any wrap-around is handled 4109 by vec_perm_indices. */ 4110 sel->new_vector (nelt, 1, 3); 4111 for (unsigned int i = 0; i < 3; i++) 4112 sel->quick_push (i + offset); 4113 } 4114 4115 /* Checks whether the target supports whole-vector shifts for vectors of mode 4116 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ 4117 it supports vec_perm_const with masks for all necessary shift amounts. */ 4118 static bool 4119 have_whole_vector_shift (machine_mode mode) 4120 { 4121 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) 4122 return true; 4123 4124 /* Variable-length vectors should be handled via the optab. 
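     For fixed-length vectors the loop below tries every power-of-two shift
     amount down to a single element; e.g. for an (assumed) 8-element vector
     it asks whether the target can permute with the stepped selectors
     starting at 4, 2 and 1.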
*/ 4125 unsigned int nelt; 4126 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 4127 return false; 4128 4129 vec_perm_builder sel; 4130 vec_perm_indices indices; 4131 for (unsigned int i = nelt / 2; i >= 1; i /= 2) 4132 { 4133 calc_vec_perm_mask_for_shift (i, nelt, &sel); 4134 indices.new_vector (sel, 2, nelt); 4135 if (!can_vec_perm_const_p (mode, indices, false)) 4136 return false; 4137 } 4138 return true; 4139 } 4140 4141 /* TODO: Close dependency between vect_model_*_cost and vectorizable_* 4142 functions. Design better to avoid maintenance issues. */ 4143 4144 /* Function vect_model_reduction_cost. 4145 4146 Models cost for a reduction operation, including the vector ops 4147 generated within the strip-mine loop, the initial definition before 4148 the loop, and the epilogue code that must be generated. */ 4149 4150 static void 4151 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, 4152 int ncopies) 4153 { 4154 int prologue_cost = 0, epilogue_cost = 0, inside_cost; 4155 enum tree_code code; 4156 optab optab; 4157 tree vectype; 4158 gimple *orig_stmt; 4159 machine_mode mode; 4160 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4161 struct loop *loop = NULL; 4162 void *target_cost_data; 4163 4164 if (loop_vinfo) 4165 { 4166 loop = LOOP_VINFO_LOOP (loop_vinfo); 4167 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 4168 } 4169 else 4170 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info)); 4171 4172 /* Condition reductions generate two reductions in the loop. */ 4173 vect_reduction_type reduction_type 4174 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); 4175 if (reduction_type == COND_REDUCTION) 4176 ncopies *= 2; 4177 4178 vectype = STMT_VINFO_VECTYPE (stmt_info); 4179 mode = TYPE_MODE (vectype); 4180 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 4181 4182 if (!orig_stmt) 4183 orig_stmt = STMT_VINFO_STMT (stmt_info); 4184 4185 code = gimple_assign_rhs_code (orig_stmt); 4186 4187 if (reduction_type == EXTRACT_LAST_REDUCTION 4188 || reduction_type == FOLD_LEFT_REDUCTION) 4189 { 4190 /* No extra instructions needed in the prologue. */ 4191 prologue_cost = 0; 4192 4193 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST) 4194 /* Count one reduction-like operation per vector. */ 4195 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar, 4196 stmt_info, 0, vect_body); 4197 else 4198 { 4199 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */ 4200 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype); 4201 inside_cost = add_stmt_cost (target_cost_data, nelements, 4202 vec_to_scalar, stmt_info, 0, 4203 vect_body); 4204 inside_cost += add_stmt_cost (target_cost_data, nelements, 4205 scalar_stmt, stmt_info, 0, 4206 vect_body); 4207 } 4208 } 4209 else 4210 { 4211 /* Add in cost for initial definition. 4212 For cond reduction we have four vectors: initial index, step, 4213 initial result of the data reduction, initial value of the index 4214 reduction. */ 4215 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1; 4216 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts, 4217 scalar_to_vec, stmt_info, 0, 4218 vect_prologue); 4219 4220 /* Cost of reduction op inside loop. */ 4221 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, 4222 stmt_info, 0, vect_body); 4223 } 4224 4225 /* Determine cost of epilogue code. 4226 4227 We have a reduction operator that will reduce the vector in one statement. 4228 Also requires scalar extract. 
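     For example (a sketch of the accounting, not exact target costs): a
     plain sum reduction with REDUC_FN available is charged one vector_stmt
     for the reduction plus one vec_to_scalar for the extract, while a
     COND_REDUCTION is charged two of each plus a scalar_to_vec broadcast,
     as the cases below show.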
*/ 4229 4230 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt)) 4231 { 4232 if (reduc_fn != IFN_LAST) 4233 { 4234 if (reduction_type == COND_REDUCTION) 4235 { 4236 /* An EQ stmt and an COND_EXPR stmt. */ 4237 epilogue_cost += add_stmt_cost (target_cost_data, 2, 4238 vector_stmt, stmt_info, 0, 4239 vect_epilogue); 4240 /* Reduction of the max index and a reduction of the found 4241 values. */ 4242 epilogue_cost += add_stmt_cost (target_cost_data, 2, 4243 vec_to_scalar, stmt_info, 0, 4244 vect_epilogue); 4245 /* A broadcast of the max value. */ 4246 epilogue_cost += add_stmt_cost (target_cost_data, 1, 4247 scalar_to_vec, stmt_info, 0, 4248 vect_epilogue); 4249 } 4250 else 4251 { 4252 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt, 4253 stmt_info, 0, vect_epilogue); 4254 epilogue_cost += add_stmt_cost (target_cost_data, 1, 4255 vec_to_scalar, stmt_info, 0, 4256 vect_epilogue); 4257 } 4258 } 4259 else if (reduction_type == COND_REDUCTION) 4260 { 4261 unsigned estimated_nunits = vect_nunits_for_cost (vectype); 4262 /* Extraction of scalar elements. */ 4263 epilogue_cost += add_stmt_cost (target_cost_data, 4264 2 * estimated_nunits, 4265 vec_to_scalar, stmt_info, 0, 4266 vect_epilogue); 4267 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */ 4268 epilogue_cost += add_stmt_cost (target_cost_data, 4269 2 * estimated_nunits - 3, 4270 scalar_stmt, stmt_info, 0, 4271 vect_epilogue); 4272 } 4273 else if (reduction_type == EXTRACT_LAST_REDUCTION 4274 || reduction_type == FOLD_LEFT_REDUCTION) 4275 /* No extra instructions need in the epilogue. */ 4276 ; 4277 else 4278 { 4279 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 4280 tree bitsize = 4281 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); 4282 int element_bitsize = tree_to_uhwi (bitsize); 4283 int nelements = vec_size_in_bits / element_bitsize; 4284 4285 if (code == COND_EXPR) 4286 code = MAX_EXPR; 4287 4288 optab = optab_for_tree_code (code, vectype, optab_default); 4289 4290 /* We have a whole vector shift available. */ 4291 if (optab != unknown_optab 4292 && VECTOR_MODE_P (mode) 4293 && optab_handler (optab, mode) != CODE_FOR_nothing 4294 && have_whole_vector_shift (mode)) 4295 { 4296 /* Final reduction via vector shifts and the reduction operator. 4297 Also requires scalar extract. */ 4298 epilogue_cost += add_stmt_cost (target_cost_data, 4299 exact_log2 (nelements) * 2, 4300 vector_stmt, stmt_info, 0, 4301 vect_epilogue); 4302 epilogue_cost += add_stmt_cost (target_cost_data, 1, 4303 vec_to_scalar, stmt_info, 0, 4304 vect_epilogue); 4305 } 4306 else 4307 /* Use extracts and reduction op for final reduction. For N 4308 elements, we have N extracts and N-1 reduction ops. */ 4309 epilogue_cost += add_stmt_cost (target_cost_data, 4310 nelements + nelements - 1, 4311 vector_stmt, stmt_info, 0, 4312 vect_epilogue); 4313 } 4314 } 4315 4316 if (dump_enabled_p ()) 4317 dump_printf (MSG_NOTE, 4318 "vect_model_reduction_cost: inside_cost = %d, " 4319 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost, 4320 prologue_cost, epilogue_cost); 4321 } 4322 4323 4324 /* Function vect_model_induction_cost. 4325 4326 Models cost for induction operations. 
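   For illustration, an induction vectorized with NCOPIES = 2 would be
   charged two vector statements in the loop body plus two scalar_to_vec
   statements in the prologue (the vector of initial values and the vector
   of steps).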
*/ 4327 4328 static void 4329 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) 4330 { 4331 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4332 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 4333 unsigned inside_cost, prologue_cost; 4334 4335 if (PURE_SLP_STMT (stmt_info)) 4336 return; 4337 4338 /* loop cost for vec_loop. */ 4339 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, 4340 stmt_info, 0, vect_body); 4341 4342 /* prologue cost for vec_init and vec_step. */ 4343 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec, 4344 stmt_info, 0, vect_prologue); 4345 4346 if (dump_enabled_p ()) 4347 dump_printf_loc (MSG_NOTE, vect_location, 4348 "vect_model_induction_cost: inside_cost = %d, " 4349 "prologue_cost = %d .\n", inside_cost, prologue_cost); 4350 } 4351 4352 4353 4354 /* Function get_initial_def_for_reduction 4355 4356 Input: 4357 STMT - a stmt that performs a reduction operation in the loop. 4358 INIT_VAL - the initial value of the reduction variable 4359 4360 Output: 4361 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result 4362 of the reduction (used for adjusting the epilog - see below). 4363 Return a vector variable, initialized according to the operation that STMT 4364 performs. This vector will be used as the initial value of the 4365 vector of partial results. 4366 4367 Option1 (adjust in epilog): Initialize the vector as follows: 4368 add/bit or/xor: [0,0,...,0,0] 4369 mult/bit and: [1,1,...,1,1] 4370 min/max/cond_expr: [init_val,init_val,..,init_val,init_val] 4371 and when necessary (e.g. add/mult case) let the caller know 4372 that it needs to adjust the result by init_val. 4373 4374 Option2: Initialize the vector as follows: 4375 add/bit or/xor: [init_val,0,0,...,0] 4376 mult/bit and: [init_val,1,1,...,1] 4377 min/max/cond_expr: [init_val,init_val,...,init_val] 4378 and no adjustments are needed. 4379 4380 For example, for the following code: 4381 4382 s = init_val; 4383 for (i=0;i<n;i++) 4384 s = s + a[i]; 4385 4386 STMT is 's = s + a[i]', and the reduction variable is 's'. 4387 For a vector of 4 units, we want to return either [0,0,0,init_val], 4388 or [0,0,0,0] and let the caller know that it needs to adjust 4389 the result at the end by 'init_val'. 4390 4391 FORNOW, we are using the 'adjust in epilog' scheme, because this way the 4392 initialization vector is simpler (same element in all entries), if 4393 ADJUSTMENT_DEF is not NULL, and Option2 otherwise. 4394 4395 A cost model should help decide between these two schemes. 
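   By analogy, a product reduction (hypothetical init_val):

     s = 5;
     for (i=0;i<n;i++)
       s = s * a[i];

   would use [1,1,1,1] under Option1, with the caller multiplying the final
   result by 5, and [5,1,1,1] under Option2 with no adjustment.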
*/ 4396 4397 tree 4398 get_initial_def_for_reduction (gimple *stmt, tree init_val, 4399 tree *adjustment_def) 4400 { 4401 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 4402 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 4403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 4404 tree scalar_type = TREE_TYPE (init_val); 4405 tree vectype = get_vectype_for_scalar_type (scalar_type); 4406 enum tree_code code = gimple_assign_rhs_code (stmt); 4407 tree def_for_init; 4408 tree init_def; 4409 bool nested_in_vect_loop = false; 4410 REAL_VALUE_TYPE real_init_val = dconst0; 4411 int int_init_val = 0; 4412 gimple *def_stmt = NULL; 4413 gimple_seq stmts = NULL; 4414 4415 gcc_assert (vectype); 4416 4417 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) 4418 || SCALAR_FLOAT_TYPE_P (scalar_type)); 4419 4420 if (nested_in_vect_loop_p (loop, stmt)) 4421 nested_in_vect_loop = true; 4422 else 4423 gcc_assert (loop == (gimple_bb (stmt))->loop_father); 4424 4425 /* In case of double reduction we only create a vector variable to be put 4426 in the reduction phi node. The actual statement creation is done in 4427 vect_create_epilog_for_reduction. */ 4428 if (adjustment_def && nested_in_vect_loop 4429 && TREE_CODE (init_val) == SSA_NAME 4430 && (def_stmt = SSA_NAME_DEF_STMT (init_val)) 4431 && gimple_code (def_stmt) == GIMPLE_PHI 4432 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 4433 && vinfo_for_stmt (def_stmt) 4434 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 4435 == vect_double_reduction_def) 4436 { 4437 *adjustment_def = NULL; 4438 return vect_create_destination_var (init_val, vectype); 4439 } 4440 4441 vect_reduction_type reduction_type 4442 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); 4443 4444 /* In case of a nested reduction do not use an adjustment def as 4445 that case is not supported by the epilogue generation correctly 4446 if ncopies is not one. */ 4447 if (adjustment_def && nested_in_vect_loop) 4448 { 4449 *adjustment_def = NULL; 4450 return vect_get_vec_def_for_operand (init_val, stmt); 4451 } 4452 4453 switch (code) 4454 { 4455 case WIDEN_SUM_EXPR: 4456 case DOT_PROD_EXPR: 4457 case SAD_EXPR: 4458 case PLUS_EXPR: 4459 case MINUS_EXPR: 4460 case BIT_IOR_EXPR: 4461 case BIT_XOR_EXPR: 4462 case MULT_EXPR: 4463 case BIT_AND_EXPR: 4464 { 4465 /* ADJUSTMENT_DEF is NULL when called from 4466 vect_create_epilog_for_reduction to vectorize double reduction. */ 4467 if (adjustment_def) 4468 *adjustment_def = init_val; 4469 4470 if (code == MULT_EXPR) 4471 { 4472 real_init_val = dconst1; 4473 int_init_val = 1; 4474 } 4475 4476 if (code == BIT_AND_EXPR) 4477 int_init_val = -1; 4478 4479 if (SCALAR_FLOAT_TYPE_P (scalar_type)) 4480 def_for_init = build_real (scalar_type, real_init_val); 4481 else 4482 def_for_init = build_int_cst (scalar_type, int_init_val); 4483 4484 if (adjustment_def) 4485 /* Option1: the first element is '0' or '1' as well. */ 4486 init_def = gimple_build_vector_from_val (&stmts, vectype, 4487 def_for_init); 4488 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) 4489 { 4490 /* Option2 (variable length): the first element is INIT_VAL. */ 4491 init_def = build_vector_from_val (vectype, def_for_init); 4492 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT, 4493 2, init_def, init_val); 4494 init_def = make_ssa_name (vectype); 4495 gimple_call_set_lhs (call, init_def); 4496 gimple_seq_add_stmt (&stmts, call); 4497 } 4498 else 4499 { 4500 /* Option2: the first element is INIT_VAL. 
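	     The builder below encodes {INIT_VAL, DEF_FOR_INIT, DEF_FOR_INIT, ...};
	     e.g. for a 4-element add reduction (DEF_FOR_INIT is 0) this
	     gives {init_val, 0, 0, 0}.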
*/ 4501 tree_vector_builder elts (vectype, 1, 2); 4502 elts.quick_push (init_val); 4503 elts.quick_push (def_for_init); 4504 init_def = gimple_build_vector (&stmts, &elts); 4505 } 4506 } 4507 break; 4508 4509 case MIN_EXPR: 4510 case MAX_EXPR: 4511 case COND_EXPR: 4512 { 4513 if (adjustment_def) 4514 { 4515 *adjustment_def = NULL_TREE; 4516 if (reduction_type != COND_REDUCTION 4517 && reduction_type != EXTRACT_LAST_REDUCTION) 4518 { 4519 init_def = vect_get_vec_def_for_operand (init_val, stmt); 4520 break; 4521 } 4522 } 4523 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); 4524 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); 4525 } 4526 break; 4527 4528 default: 4529 gcc_unreachable (); 4530 } 4531 4532 if (stmts) 4533 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 4534 return init_def; 4535 } 4536 4537 /* Get at the initial defs for the reduction PHIs in SLP_NODE. 4538 NUMBER_OF_VECTORS is the number of vector defs to create. 4539 If NEUTRAL_OP is nonnull, introducing extra elements of that 4540 value will not change the result. */ 4541 4542 static void 4543 get_initial_defs_for_reduction (slp_tree slp_node, 4544 vec<tree> *vec_oprnds, 4545 unsigned int number_of_vectors, 4546 bool reduc_chain, tree neutral_op) 4547 { 4548 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 4549 gimple *stmt = stmts[0]; 4550 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 4551 unsigned HOST_WIDE_INT nunits; 4552 unsigned j, number_of_places_left_in_vector; 4553 tree vector_type; 4554 unsigned int group_size = stmts.length (); 4555 unsigned int i; 4556 struct loop *loop; 4557 4558 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); 4559 4560 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); 4561 4562 loop = (gimple_bb (stmt))->loop_father; 4563 gcc_assert (loop); 4564 edge pe = loop_preheader_edge (loop); 4565 4566 gcc_assert (!reduc_chain || neutral_op); 4567 4568 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 4569 created vectors. It is greater than 1 if unrolling is performed. 4570 4571 For example, we have two scalar operands, s1 and s2 (e.g., group of 4572 strided accesses of size two), while NUNITS is four (i.e., four scalars 4573 of this type can be packed in a vector). The output vector will contain 4574 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 4575 will be 2). 4576 4577 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors 4578 containing the operands. 4579 4580 For example, NUNITS is four as before, and the group size is 8 4581 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 4582 {s5, s6, s7, s8}. */ 4583 4584 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) 4585 nunits = group_size; 4586 4587 number_of_places_left_in_vector = nunits; 4588 bool constant_p = true; 4589 tree_vector_builder elts (vector_type, nunits, 1); 4590 elts.quick_grow (nunits); 4591 gimple_seq ctor_seq = NULL; 4592 for (j = 0; j < nunits * number_of_vectors; ++j) 4593 { 4594 tree op; 4595 i = j % group_size; 4596 stmt_vinfo = vinfo_for_stmt (stmts[i]); 4597 4598 /* Get the def before the loop. In reduction chain we have only 4599 one initial value. Else we have as many as PHIs in the group. */ 4600 if (reduc_chain) 4601 op = j != 0 ? 
neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); 4602 else if (((vec_oprnds->length () + 1) * nunits 4603 - number_of_places_left_in_vector >= group_size) 4604 && neutral_op) 4605 op = neutral_op; 4606 else 4607 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); 4608 4609 /* Create 'vect_ = {op0,op1,...,opn}'. */ 4610 number_of_places_left_in_vector--; 4611 elts[nunits - number_of_places_left_in_vector - 1] = op; 4612 if (!CONSTANT_CLASS_P (op)) 4613 constant_p = false; 4614 4615 if (number_of_places_left_in_vector == 0) 4616 { 4617 tree init; 4618 if (constant_p && !neutral_op 4619 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) 4620 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) 4621 /* Build the vector directly from ELTS. */ 4622 init = gimple_build_vector (&ctor_seq, &elts); 4623 else if (neutral_op) 4624 { 4625 /* Build a vector of the neutral value and shift the 4626 other elements into place. */ 4627 init = gimple_build_vector_from_val (&ctor_seq, vector_type, 4628 neutral_op); 4629 int k = nunits; 4630 while (k > 0 && elts[k - 1] == neutral_op) 4631 k -= 1; 4632 while (k > 0) 4633 { 4634 k -= 1; 4635 gcall *call = gimple_build_call_internal 4636 (IFN_VEC_SHL_INSERT, 2, init, elts[k]); 4637 init = make_ssa_name (vector_type); 4638 gimple_call_set_lhs (call, init); 4639 gimple_seq_add_stmt (&ctor_seq, call); 4640 } 4641 } 4642 else 4643 { 4644 /* First time round, duplicate ELTS to fill the 4645 required number of vectors. */ 4646 duplicate_and_interleave (&ctor_seq, vector_type, elts, 4647 number_of_vectors, *vec_oprnds); 4648 break; 4649 } 4650 vec_oprnds->quick_push (init); 4651 4652 number_of_places_left_in_vector = nunits; 4653 elts.new_vector (vector_type, nunits, 1); 4654 elts.quick_grow (nunits); 4655 constant_p = true; 4656 } 4657 } 4658 if (ctor_seq != NULL) 4659 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); 4660 } 4661 4662 4663 /* Function vect_create_epilog_for_reduction 4664 4665 Create code at the loop-epilog to finalize the result of a reduction 4666 computation. 4667 4668 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector 4669 reduction statements. 4670 STMT is the scalar reduction stmt that is being vectorized. 4671 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the 4672 number of elements that we can fit in a vectype (nunits). In this case 4673 we have to generate more than one vector stmt - i.e - we need to "unroll" 4674 the vector stmt by a factor VF/nunits. For more details see documentation 4675 in vectorizable_operation. 4676 REDUC_FN is the internal function for the epilog reduction. 4677 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction 4678 computation. 4679 REDUC_INDEX is the index of the operand in the right hand side of the 4680 statement that is defined by REDUCTION_PHI. 4681 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. 4682 SLP_NODE is an SLP node containing a group of reduction statements. The 4683 first one in this group is STMT. 4684 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case 4685 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to 4686 be smaller than any value of the IV in the loop, for MIN_EXPR larger than 4687 any value of the IV in the loop. 4688 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION. 4689 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is 4690 null if this is not an SLP reduction 4691 4692 This function: 4693 1. 
Creates the reduction def-use cycles: sets the arguments for 4694 REDUCTION_PHIS: 4695 The loop-entry argument is the vectorized initial-value of the reduction. 4696 The loop-latch argument is taken from VECT_DEFS - the vector of partial 4697 sums. 4698 2. "Reduces" each vector of partial results VECT_DEFS into a single result, 4699 by calling the function specified by REDUC_FN if available, or by 4700 other means (whole-vector shifts or a scalar loop). 4701 The function also creates a new phi node at the loop exit to preserve 4702 loop-closed form, as illustrated below. 4703 4704 The flow at the entry to this function: 4705 4706 loop: 4707 vec_def = phi <null, null> # REDUCTION_PHI 4708 VECT_DEF = vector_stmt # vectorized form of STMT 4709 s_loop = scalar_stmt # (scalar) STMT 4710 loop_exit: 4711 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4712 use <s_out0> 4713 use <s_out0> 4714 4715 The above is transformed by this function into: 4716 4717 loop: 4718 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4719 VECT_DEF = vector_stmt # vectorized form of STMT 4720 s_loop = scalar_stmt # (scalar) STMT 4721 loop_exit: 4722 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4723 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4724 v_out2 = reduce <v_out1> 4725 s_out3 = extract_field <v_out2, 0> 4726 s_out4 = adjust_result <s_out3> 4727 use <s_out4> 4728 use <s_out4> 4729 */ 4730 4731 static void 4732 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, 4733 gimple *reduc_def_stmt, 4734 int ncopies, internal_fn reduc_fn, 4735 vec<gimple *> reduction_phis, 4736 bool double_reduc, 4737 slp_tree slp_node, 4738 slp_instance slp_node_instance, 4739 tree induc_val, enum tree_code induc_code, 4740 tree neutral_op) 4741 { 4742 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4743 stmt_vec_info prev_phi_info; 4744 tree vectype; 4745 machine_mode mode; 4746 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4747 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; 4748 basic_block exit_bb; 4749 tree scalar_dest; 4750 tree scalar_type; 4751 gimple *new_phi = NULL, *phi; 4752 gimple_stmt_iterator exit_gsi; 4753 tree vec_dest; 4754 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; 4755 gimple *epilog_stmt = NULL; 4756 enum tree_code code = gimple_assign_rhs_code (stmt); 4757 gimple *exit_phi; 4758 tree bitsize; 4759 tree adjustment_def = NULL; 4760 tree vec_initial_def = NULL; 4761 tree expr, def, initial_def = NULL; 4762 tree orig_name, scalar_result; 4763 imm_use_iterator imm_iter, phi_imm_iter; 4764 use_operand_p use_p, phi_use_p; 4765 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL; 4766 bool nested_in_vect_loop = false; 4767 auto_vec<gimple *> new_phis; 4768 auto_vec<gimple *> inner_phis; 4769 enum vect_def_type dt = vect_unknown_def_type; 4770 int j, i; 4771 auto_vec<tree> scalar_results; 4772 unsigned int group_size = 1, k, ratio; 4773 auto_vec<tree> vec_initial_defs; 4774 auto_vec<gimple *> phis; 4775 bool slp_reduc = false; 4776 bool direct_slp_reduc; 4777 tree new_phi_result; 4778 gimple *inner_phi = NULL; 4779 tree induction_index = NULL_TREE; 4780 4781 if (slp_node) 4782 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 4783 4784 if (nested_in_vect_loop_p (loop, stmt)) 4785 { 4786 outer_loop = loop; 4787 loop = loop->inner; 4788 nested_in_vect_loop = true; 4789 gcc_assert (!slp_node); 4790 } 4791 4792 vectype = STMT_VINFO_VECTYPE (stmt_info); 4793 gcc_assert (vectype); 4794 mode = TYPE_MODE (vectype); 4795 4796 /* 1. 
Create the reduction def-use cycle: 4797 Set the arguments of REDUCTION_PHIS, i.e., transform 4798 4799 loop: 4800 vec_def = phi <null, null> # REDUCTION_PHI 4801 VECT_DEF = vector_stmt # vectorized form of STMT 4802 ... 4803 4804 into: 4805 4806 loop: 4807 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4808 VECT_DEF = vector_stmt # vectorized form of STMT 4809 ... 4810 4811 (in case of SLP, do it for all the phis). */ 4812 4813 /* Get the loop-entry arguments. */ 4814 enum vect_def_type initial_def_dt = vect_unknown_def_type; 4815 if (slp_node) 4816 { 4817 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 4818 vec_initial_defs.reserve (vec_num); 4819 get_initial_defs_for_reduction (slp_node_instance->reduc_phis, 4820 &vec_initial_defs, vec_num, 4821 GROUP_FIRST_ELEMENT (stmt_info), 4822 neutral_op); 4823 } 4824 else 4825 { 4826 /* Get at the scalar def before the loop, that defines the initial value 4827 of the reduction variable. */ 4828 gimple *def_stmt; 4829 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 4830 loop_preheader_edge (loop)); 4831 /* Optimize: if initial_def is for REDUC_MAX smaller than the base 4832 and we can't use zero for induc_val, use initial_def. Similarly 4833 for REDUC_MIN and initial_def larger than the base. */ 4834 if (TREE_CODE (initial_def) == INTEGER_CST 4835 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4836 == INTEGER_INDUC_COND_REDUCTION) 4837 && !integer_zerop (induc_val) 4838 && ((induc_code == MAX_EXPR 4839 && tree_int_cst_lt (initial_def, induc_val)) 4840 || (induc_code == MIN_EXPR 4841 && tree_int_cst_lt (induc_val, initial_def)))) 4842 induc_val = initial_def; 4843 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt); 4844 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def, 4845 &adjustment_def); 4846 vec_initial_defs.create (1); 4847 vec_initial_defs.quick_push (vec_initial_def); 4848 } 4849 4850 /* Set phi nodes arguments. */ 4851 FOR_EACH_VEC_ELT (reduction_phis, i, phi) 4852 { 4853 tree vec_init_def = vec_initial_defs[i]; 4854 tree def = vect_defs[i]; 4855 for (j = 0; j < ncopies; j++) 4856 { 4857 if (j != 0) 4858 { 4859 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); 4860 if (nested_in_vect_loop) 4861 vec_init_def 4862 = vect_get_vec_def_for_stmt_copy (initial_def_dt, 4863 vec_init_def); 4864 } 4865 4866 /* Set the loop-entry arg of the reduction-phi. */ 4867 4868 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4869 == INTEGER_INDUC_COND_REDUCTION) 4870 { 4871 /* Initialise the reduction phi to zero. This prevents initial 4872 values of non-zero interferring with the reduction op. */ 4873 gcc_assert (ncopies == 1); 4874 gcc_assert (i == 0); 4875 4876 tree vec_init_def_type = TREE_TYPE (vec_init_def); 4877 tree induc_val_vec 4878 = build_vector_from_val (vec_init_def_type, induc_val); 4879 4880 add_phi_arg (as_a <gphi *> (phi), induc_val_vec, 4881 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4882 } 4883 else 4884 add_phi_arg (as_a <gphi *> (phi), vec_init_def, 4885 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4886 4887 /* Set the loop-latch arg for the reduction-phi. 
*/ 4888 if (j > 0) 4889 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def); 4890 4891 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop), 4892 UNKNOWN_LOCATION); 4893 4894 if (dump_enabled_p ()) 4895 { 4896 dump_printf_loc (MSG_NOTE, vect_location, 4897 "transform reduction: created def-use cycle: "); 4898 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 4899 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0); 4900 } 4901 } 4902 } 4903 4904 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) 4905 which is updated with the current index of the loop for every match of 4906 the original loop's cond_expr (VEC_STMT). This results in a vector 4907 containing the last time the condition passed for that vector lane. 4908 The first match will be a 1 to allow 0 to be used for non-matching 4909 indexes. If there are no matches at all then the vector will be all 4910 zeroes. */ 4911 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 4912 { 4913 tree indx_before_incr, indx_after_incr; 4914 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); 4915 4916 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); 4917 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); 4918 4919 int scalar_precision 4920 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); 4921 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); 4922 tree cr_index_vector_type = build_vector_type 4923 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype)); 4924 4925 /* First we create a simple vector induction variable which starts 4926 with the values {1,2,3,...} (SERIES_VECT) and increments by the 4927 vector size (STEP). */ 4928 4929 /* Create a {1,2,3,...} vector. */ 4930 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); 4931 4932 /* Create a vector of the step value. */ 4933 tree step = build_int_cst (cr_index_scalar_type, nunits_out); 4934 tree vec_step = build_vector_from_val (cr_index_vector_type, step); 4935 4936 /* Create an induction variable. */ 4937 gimple_stmt_iterator incr_gsi; 4938 bool insert_after; 4939 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4940 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, 4941 insert_after, &indx_before_incr, &indx_after_incr); 4942 4943 /* Next create a new phi node vector (NEW_PHI_TREE) which starts 4944 filled with zeros (VEC_ZERO). */ 4945 4946 /* Create a vector of 0s. */ 4947 tree zero = build_zero_cst (cr_index_scalar_type); 4948 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); 4949 4950 /* Create a vector phi node. */ 4951 tree new_phi_tree = make_ssa_name (cr_index_vector_type); 4952 new_phi = create_phi_node (new_phi_tree, loop->header); 4953 set_vinfo_for_stmt (new_phi, 4954 new_stmt_vec_info (new_phi, loop_vinfo)); 4955 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, 4956 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4957 4958 /* Now take the condition from the loops original cond_expr 4959 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for 4960 every match uses values from the induction variable 4961 (INDEX_BEFORE_INCR) otherwise uses values from the phi node 4962 (NEW_PHI_TREE). 4963 Finally, we update the phi (NEW_PHI_TREE) to take the value of 4964 the new cond_expr (INDEX_COND_EXPR). */ 4965 4966 /* Duplicate the condition from vec_stmt. 
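	 (For illustration with VF = 4: the index IV starts at {1,2,3,4}; if
	 only lanes 1 and 3 of the first vector iteration match, NEW_PHI_TREE
	 becomes {0,2,0,4}, and the next iteration compares against {5,6,7,8},
	 and so on.)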
*/ 4967 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); 4968 4969 /* Create a conditional, where the condition is taken from vec_stmt 4970 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and 4971 else is the phi (NEW_PHI_TREE). */ 4972 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, 4973 ccompare, indx_before_incr, 4974 new_phi_tree); 4975 induction_index = make_ssa_name (cr_index_vector_type); 4976 gimple *index_condition = gimple_build_assign (induction_index, 4977 index_cond_expr); 4978 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); 4979 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition, 4980 loop_vinfo); 4981 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; 4982 set_vinfo_for_stmt (index_condition, index_vec_info); 4983 4984 /* Update the phi with the vec cond. */ 4985 add_phi_arg (as_a <gphi *> (new_phi), induction_index, 4986 loop_latch_edge (loop), UNKNOWN_LOCATION); 4987 } 4988 4989 /* 2. Create epilog code. 4990 The reduction epilog code operates across the elements of the vector 4991 of partial results computed by the vectorized loop. 4992 The reduction epilog code consists of: 4993 4994 step 1: compute the scalar result in a vector (v_out2) 4995 step 2: extract the scalar result (s_out3) from the vector (v_out2) 4996 step 3: adjust the scalar result (s_out3) if needed. 4997 4998 Step 1 can be accomplished using one the following three schemes: 4999 (scheme 1) using reduc_fn, if available. 5000 (scheme 2) using whole-vector shifts, if available. 5001 (scheme 3) using a scalar loop. In this case steps 1+2 above are 5002 combined. 5003 5004 The overall epilog code looks like this: 5005 5006 s_out0 = phi <s_loop> # original EXIT_PHI 5007 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5008 v_out2 = reduce <v_out1> # step 1 5009 s_out3 = extract_field <v_out2, 0> # step 2 5010 s_out4 = adjust_result <s_out3> # step 3 5011 5012 (step 3 is optional, and steps 1 and 2 may be combined). 5013 Lastly, the uses of s_out0 are replaced by s_out4. */ 5014 5015 5016 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: 5017 v_out1 = phi <VECT_DEF> 5018 Store them in NEW_PHIS. */ 5019 5020 exit_bb = single_exit (loop)->dest; 5021 prev_phi_info = NULL; 5022 new_phis.create (vect_defs.length ()); 5023 FOR_EACH_VEC_ELT (vect_defs, i, def) 5024 { 5025 for (j = 0; j < ncopies; j++) 5026 { 5027 tree new_def = copy_ssa_name (def); 5028 phi = create_phi_node (new_def, exit_bb); 5029 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo)); 5030 if (j == 0) 5031 new_phis.quick_push (phi); 5032 else 5033 { 5034 def = vect_get_vec_def_for_stmt_copy (dt, def); 5035 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; 5036 } 5037 5038 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 5039 prev_phi_info = vinfo_for_stmt (phi); 5040 } 5041 } 5042 5043 /* The epilogue is created for the outer-loop, i.e., for the loop being 5044 vectorized. Create exit phis for the outer loop. 
*/ 5045 if (double_reduc) 5046 { 5047 loop = outer_loop; 5048 exit_bb = single_exit (loop)->dest; 5049 inner_phis.create (vect_defs.length ()); 5050 FOR_EACH_VEC_ELT (new_phis, i, phi) 5051 { 5052 tree new_result = copy_ssa_name (PHI_RESULT (phi)); 5053 gphi *outer_phi = create_phi_node (new_result, exit_bb); 5054 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 5055 PHI_RESULT (phi)); 5056 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi, 5057 loop_vinfo)); 5058 inner_phis.quick_push (phi); 5059 new_phis[i] = outer_phi; 5060 prev_phi_info = vinfo_for_stmt (outer_phi); 5061 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi))) 5062 { 5063 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); 5064 new_result = copy_ssa_name (PHI_RESULT (phi)); 5065 outer_phi = create_phi_node (new_result, exit_bb); 5066 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 5067 PHI_RESULT (phi)); 5068 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi, 5069 loop_vinfo)); 5070 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi; 5071 prev_phi_info = vinfo_for_stmt (outer_phi); 5072 } 5073 } 5074 } 5075 5076 exit_gsi = gsi_after_labels (exit_bb); 5077 5078 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 5079 (i.e. when reduc_fn is not available) and in the final adjustment 5080 code (if needed). Also get the original scalar reduction variable as 5081 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it 5082 represents a reduction pattern), the tree-code and scalar-def are 5083 taken from the original stmt that the pattern-stmt (STMT) replaces. 5084 Otherwise (it is a regular reduction) - the tree-code and scalar-def 5085 are taken from STMT. */ 5086 5087 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 5088 if (!orig_stmt) 5089 { 5090 /* Regular reduction */ 5091 orig_stmt = stmt; 5092 } 5093 else 5094 { 5095 /* Reduction pattern */ 5096 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); 5097 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); 5098 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); 5099 } 5100 5101 code = gimple_assign_rhs_code (orig_stmt); 5102 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, 5103 partial results are added and not subtracted. */ 5104 if (code == MINUS_EXPR) 5105 code = PLUS_EXPR; 5106 5107 scalar_dest = gimple_assign_lhs (orig_stmt); 5108 scalar_type = TREE_TYPE (scalar_dest); 5109 scalar_results.create (group_size); 5110 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 5111 bitsize = TYPE_SIZE (scalar_type); 5112 5113 /* In case this is a reduction in an inner-loop while vectorizing an outer 5114 loop - we don't need to extract a single scalar result at the end of the 5115 inner-loop (unless it is double reduction, i.e., the use of reduction is 5116 outside the outer-loop). The final vector of partial results will be used 5117 in the vectorized outer-loop, or reduced to a scalar result at the end of 5118 the outer-loop. */ 5119 if (nested_in_vect_loop && !double_reduc) 5120 goto vect_finalize_reduction; 5121 5122 /* SLP reduction without reduction chain, e.g., 5123 # a1 = phi <a2, a0> 5124 # b1 = phi <b2, b0> 5125 a2 = operation (a1) 5126 b2 = operation (b1) */ 5127 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))); 5128 5129 /* True if we should implement SLP_REDUC using native reduction operations 5130 instead of scalar operations. 
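     As the test below shows, this is currently only done when the number of
     vector elements is not a compile-time constant, in which case extracting
     the elements one by one is not an option.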
*/ 5131 direct_slp_reduc = (reduc_fn != IFN_LAST 5132 && slp_reduc 5133 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); 5134 5135 /* In case of reduction chain, e.g., 5136 # a1 = phi <a3, a0> 5137 a2 = operation (a1) 5138 a3 = operation (a2), 5139 5140 we may end up with more than one vector result. Here we reduce them to 5141 one vector. */ 5142 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc) 5143 { 5144 tree first_vect = PHI_RESULT (new_phis[0]); 5145 gassign *new_vec_stmt = NULL; 5146 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5147 for (k = 1; k < new_phis.length (); k++) 5148 { 5149 gimple *next_phi = new_phis[k]; 5150 tree second_vect = PHI_RESULT (next_phi); 5151 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 5152 new_vec_stmt = gimple_build_assign (tem, code, 5153 first_vect, second_vect); 5154 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 5155 first_vect = tem; 5156 } 5157 5158 new_phi_result = first_vect; 5159 if (new_vec_stmt) 5160 { 5161 new_phis.truncate (0); 5162 new_phis.safe_push (new_vec_stmt); 5163 } 5164 } 5165 /* Likewise if we couldn't use a single defuse cycle. */ 5166 else if (ncopies > 1) 5167 { 5168 gcc_assert (new_phis.length () == 1); 5169 tree first_vect = PHI_RESULT (new_phis[0]); 5170 gassign *new_vec_stmt = NULL; 5171 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5172 gimple *next_phi = new_phis[0]; 5173 for (int k = 1; k < ncopies; ++k) 5174 { 5175 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi)); 5176 tree second_vect = PHI_RESULT (next_phi); 5177 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 5178 new_vec_stmt = gimple_build_assign (tem, code, 5179 first_vect, second_vect); 5180 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 5181 first_vect = tem; 5182 } 5183 new_phi_result = first_vect; 5184 new_phis.truncate (0); 5185 new_phis.safe_push (new_vec_stmt); 5186 } 5187 else 5188 new_phi_result = PHI_RESULT (new_phis[0]); 5189 5190 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION 5191 && reduc_fn != IFN_LAST) 5192 { 5193 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing 5194 various data values where the condition matched and another vector 5195 (INDUCTION_INDEX) containing all the indexes of those matches. We 5196 need to extract the last matching index (which will be the index with 5197 highest value) and use this to index into the data vector. 5198 For the case where there were no matches, the data vector will contain 5199 all default values and the index vector will be all zeros. */ 5200 5201 /* Get various versions of the type of the vector of indexes. */ 5202 tree index_vec_type = TREE_TYPE (induction_index); 5203 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); 5204 tree index_scalar_type = TREE_TYPE (index_vec_type); 5205 tree index_vec_cmp_type = build_same_sized_truth_vector_type 5206 (index_vec_type); 5207 5208 /* Get an unsigned integer version of the type of the data vector. */ 5209 int scalar_precision 5210 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 5211 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); 5212 tree vectype_unsigned = build_vector_type 5213 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype)); 5214 5215 /* First we need to create a vector (ZERO_VEC) of zeros and another 5216 vector (MAX_INDEX_VEC) filled with the last matching index, which we 5217 can create using a MAX reduction and then expanding. 
5218 In the case where the loop never made any matches, the max index will 5219 be zero. */ 5220 5221 /* Vector of {0, 0, 0,...}. */ 5222 tree zero_vec = make_ssa_name (vectype); 5223 tree zero_vec_rhs = build_zero_cst (vectype); 5224 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs); 5225 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT); 5226 5227 /* Find maximum value from the vector of found indexes. */ 5228 tree max_index = make_ssa_name (index_scalar_type); 5229 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 5230 1, induction_index); 5231 gimple_call_set_lhs (max_index_stmt, max_index); 5232 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); 5233 5234 /* Vector of {max_index, max_index, max_index,...}. */ 5235 tree max_index_vec = make_ssa_name (index_vec_type); 5236 tree max_index_vec_rhs = build_vector_from_val (index_vec_type, 5237 max_index); 5238 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, 5239 max_index_vec_rhs); 5240 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); 5241 5242 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes 5243 with the vector (INDUCTION_INDEX) of found indexes, choosing values 5244 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) 5245 otherwise. Only one value should match, resulting in a vector 5246 (VEC_COND) with one data value and the rest zeros. 5247 In the case where the loop never made any matches, every index will 5248 match, resulting in a vector with all data values (which will all be 5249 the default value). */ 5250 5251 /* Compare the max index vector to the vector of found indexes to find 5252 the position of the max value. */ 5253 tree vec_compare = make_ssa_name (index_vec_cmp_type); 5254 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, 5255 induction_index, 5256 max_index_vec); 5257 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); 5258 5259 /* Use the compare to choose either values from the data vector or 5260 zero. */ 5261 tree vec_cond = make_ssa_name (vectype); 5262 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, 5263 vec_compare, new_phi_result, 5264 zero_vec); 5265 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); 5266 5267 /* Finally we need to extract the data value from the vector (VEC_COND) 5268 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR 5269 reduction, but because this doesn't exist, we can use a MAX reduction 5270 instead. The data value might be signed or a float so we need to cast 5271 it first. 5272 In the case where the loop never made any matches, the data values are 5273 all identical, and so will reduce down correctly. */ 5274 5275 /* Make the matched data values unsigned. */ 5276 tree vec_cond_cast = make_ssa_name (vectype_unsigned); 5277 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned, 5278 vec_cond); 5279 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast, 5280 VIEW_CONVERT_EXPR, 5281 vec_cond_cast_rhs); 5282 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT); 5283 5284 /* Reduce down to a scalar value. 
*/ 5285 tree data_reduc = make_ssa_name (scalar_type_unsigned); 5286 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 5287 1, vec_cond_cast); 5288 gimple_call_set_lhs (data_reduc_stmt, data_reduc); 5289 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); 5290 5291 /* Convert the reduced value back to the result type and set as the 5292 result. */ 5293 gimple_seq stmts = NULL; 5294 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, 5295 data_reduc); 5296 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5297 scalar_results.safe_push (new_temp); 5298 } 5299 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION 5300 && reduc_fn == IFN_LAST) 5301 { 5302 /* Condition reduction without supported IFN_REDUC_MAX. Generate 5303 idx = 0; 5304 idx_val = induction_index[0]; 5305 val = data_reduc[0]; 5306 for (idx = 0, val = init, i = 0; i < nelts; ++i) 5307 if (induction_index[i] > idx_val) 5308 val = data_reduc[i], idx_val = induction_index[i]; 5309 return val; */ 5310 5311 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); 5312 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); 5313 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); 5314 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); 5315 /* Enforced by vectorizable_reduction, which ensures we have target 5316 support before allowing a conditional reduction on variable-length 5317 vectors. */ 5318 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); 5319 tree idx_val = NULL_TREE, val = NULL_TREE; 5320 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) 5321 { 5322 tree old_idx_val = idx_val; 5323 tree old_val = val; 5324 idx_val = make_ssa_name (idx_eltype); 5325 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF, 5326 build3 (BIT_FIELD_REF, idx_eltype, 5327 induction_index, 5328 bitsize_int (el_size), 5329 bitsize_int (off))); 5330 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5331 val = make_ssa_name (data_eltype); 5332 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, 5333 build3 (BIT_FIELD_REF, 5334 data_eltype, 5335 new_phi_result, 5336 bitsize_int (el_size), 5337 bitsize_int (off))); 5338 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5339 if (off != 0) 5340 { 5341 tree new_idx_val = idx_val; 5342 tree new_val = val; 5343 if (off != v_size - el_size) 5344 { 5345 new_idx_val = make_ssa_name (idx_eltype); 5346 epilog_stmt = gimple_build_assign (new_idx_val, 5347 MAX_EXPR, idx_val, 5348 old_idx_val); 5349 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5350 } 5351 new_val = make_ssa_name (data_eltype); 5352 epilog_stmt = gimple_build_assign (new_val, 5353 COND_EXPR, 5354 build2 (GT_EXPR, 5355 boolean_type_node, 5356 idx_val, 5357 old_idx_val), 5358 val, old_val); 5359 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5360 idx_val = new_idx_val; 5361 val = new_val; 5362 } 5363 } 5364 /* Convert the reduced value back to the result type and set as the 5365 result. */ 5366 gimple_seq stmts = NULL; 5367 val = gimple_convert (&stmts, scalar_type, val); 5368 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5369 scalar_results.safe_push (val); 5370 } 5371 5372 /* 2.3 Create the reduction code, using one of the three schemes described 5373 above. In SLP we simply need to extract all the elements from the 5374 vector (without reducing them), so we use scalar shifts. 
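For orientation, with a 4-lane PLUS reduction of v = { a, b, c, d } the
     three schemes roughly produce (a sketch, not the exact GIMPLE):

       Case 1 (direct):        s = REDUC_PLUS <v>
       Case 2 (vector shifts): v' = shift v by 2 elts;  v = v + v';
                               v' = shift v by 1 elt;   v = v + v';
                               s = v[0]
       Case 3 (scalar code):   s = v[0] + v[1] + v[2] + v[3]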
*/ 5375 else if (reduc_fn != IFN_LAST && !slp_reduc) 5376 { 5377 tree tmp; 5378 tree vec_elem_type; 5379 5380 /* Case 1: Create: 5381 v_out2 = reduc_expr <v_out1> */ 5382 5383 if (dump_enabled_p ()) 5384 dump_printf_loc (MSG_NOTE, vect_location, 5385 "Reduce using direct vector reduction.\n"); 5386 5387 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); 5388 if (!useless_type_conversion_p (scalar_type, vec_elem_type)) 5389 { 5390 tree tmp_dest 5391 = vect_create_destination_var (scalar_dest, vec_elem_type); 5392 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, 5393 new_phi_result); 5394 gimple_set_lhs (epilog_stmt, tmp_dest); 5395 new_temp = make_ssa_name (tmp_dest, epilog_stmt); 5396 gimple_set_lhs (epilog_stmt, new_temp); 5397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5398 5399 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR, 5400 new_temp); 5401 } 5402 else 5403 { 5404 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, 5405 new_phi_result); 5406 gimple_set_lhs (epilog_stmt, new_scalar_dest); 5407 } 5408 5409 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5410 gimple_set_lhs (epilog_stmt, new_temp); 5411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5412 5413 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5414 == INTEGER_INDUC_COND_REDUCTION) 5415 && !operand_equal_p (initial_def, induc_val, 0)) 5416 { 5417 /* Earlier we set the initial value to be a vector if induc_val 5418 values. Check the result and if it is induc_val then replace 5419 with the original initial value, unless induc_val is 5420 the same as initial_def already. */ 5421 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 5422 induc_val); 5423 5424 tmp = make_ssa_name (new_scalar_dest); 5425 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 5426 initial_def, new_temp); 5427 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5428 new_temp = tmp; 5429 } 5430 5431 scalar_results.safe_push (new_temp); 5432 } 5433 else if (direct_slp_reduc) 5434 { 5435 /* Here we create one vector for each of the GROUP_SIZE results, 5436 with the elements for other SLP statements replaced with the 5437 neutral value. We can then do a normal reduction on each vector. */ 5438 5439 /* Enforced by vectorizable_reduction. */ 5440 gcc_assert (new_phis.length () == 1); 5441 gcc_assert (pow2p_hwi (group_size)); 5442 5443 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; 5444 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); 5445 gimple_seq seq = NULL; 5446 5447 /* Build a vector {0, 1, 2, ...}, with the same number of elements 5448 and the same element size as VECTYPE. */ 5449 tree index = build_index_vector (vectype, 0, 1); 5450 tree index_type = TREE_TYPE (index); 5451 tree index_elt_type = TREE_TYPE (index_type); 5452 tree mask_type = build_same_sized_truth_vector_type (index_type); 5453 5454 /* Create a vector that, for each element, identifies which of 5455 the GROUP_SIZE results should use it. */ 5456 tree index_mask = build_int_cst (index_elt_type, group_size - 1); 5457 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, 5458 build_vector_from_val (index_type, index_mask)); 5459 5460 /* Get a neutral vector value. This is simply a splat of the neutral 5461 scalar value if we have one, otherwise the initial scalar value 5462 is itself a neutral value. 
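For example, 0 is the neutral value for PLUS and BIT_IOR, 1 for MULT and
     all-ones for BIT_AND; MIN and MAX have no such universal value, which is
     why the initial value taken from the scalar PHI is used for them below.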
*/ 5463 tree vector_identity = NULL_TREE; 5464 if (neutral_op) 5465 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5466 neutral_op); 5467 for (unsigned int i = 0; i < group_size; ++i) 5468 { 5469 /* If there's no univeral neutral value, we can use the 5470 initial scalar value from the original PHI. This is used 5471 for MIN and MAX reduction, for example. */ 5472 if (!neutral_op) 5473 { 5474 tree scalar_value 5475 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i], 5476 loop_preheader_edge (loop)); 5477 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5478 scalar_value); 5479 } 5480 5481 /* Calculate the equivalent of: 5482 5483 sel[j] = (index[j] == i); 5484 5485 which selects the elements of NEW_PHI_RESULT that should 5486 be included in the result. */ 5487 tree compare_val = build_int_cst (index_elt_type, i); 5488 compare_val = build_vector_from_val (index_type, compare_val); 5489 tree sel = gimple_build (&seq, EQ_EXPR, mask_type, 5490 index, compare_val); 5491 5492 /* Calculate the equivalent of: 5493 5494 vec = seq ? new_phi_result : vector_identity; 5495 5496 VEC is now suitable for a full vector reduction. */ 5497 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, 5498 sel, new_phi_result, vector_identity); 5499 5500 /* Do the reduction and convert it to the appropriate type. */ 5501 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec); 5502 tree scalar = make_ssa_name (TREE_TYPE (vectype)); 5503 gimple_call_set_lhs (call, scalar); 5504 gimple_seq_add_stmt (&seq, call); 5505 scalar = gimple_convert (&seq, scalar_type, scalar); 5506 scalar_results.safe_push (scalar); 5507 } 5508 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); 5509 } 5510 else 5511 { 5512 bool reduce_with_shift; 5513 tree vec_temp; 5514 5515 /* COND reductions all do the final reduction with MAX_EXPR 5516 or MIN_EXPR. */ 5517 if (code == COND_EXPR) 5518 { 5519 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5520 == INTEGER_INDUC_COND_REDUCTION) 5521 code = induc_code; 5522 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5523 == CONST_COND_REDUCTION) 5524 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 5525 else 5526 code = MAX_EXPR; 5527 } 5528 5529 /* See if the target wants to do the final (shift) reduction 5530 in a vector mode of smaller size and first reduce upper/lower 5531 halves against each other. */ 5532 enum machine_mode mode1 = mode; 5533 tree vectype1 = vectype; 5534 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); 5535 unsigned sz1 = sz; 5536 if (!slp_reduc 5537 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) 5538 sz1 = GET_MODE_SIZE (mode1).to_constant (); 5539 5540 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); 5541 reduce_with_shift = have_whole_vector_shift (mode1); 5542 if (!VECTOR_MODE_P (mode1)) 5543 reduce_with_shift = false; 5544 else 5545 { 5546 optab optab = optab_for_tree_code (code, vectype1, optab_default); 5547 if (optab_handler (optab, mode1) == CODE_FOR_nothing) 5548 reduce_with_shift = false; 5549 } 5550 5551 /* First reduce the vector to the desired vector size we should 5552 do shift reduction on by combining upper and lower halves. */ 5553 new_temp = new_phi_result; 5554 while (sz > sz1) 5555 { 5556 gcc_assert (!slp_reduc); 5557 sz /= 2; 5558 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); 5559 5560 /* The target has to make sure we support lowpart/highpart 5561 extraction, either via direct vector extract or through 5562 an integer mode punning. 
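For example, halving a V4SI accumulator v (a sketch; the statements below
     use BIT_FIELD_REF or VIEW_CONVERT_EXPR to do this):

       dst1     = lowpart  <V2SI> (v)      i.e. { v[0], v[1] }
       dst2     = highpart <V2SI> (v)      i.e. { v[2], v[3] }
       new_temp = vop <dst1, dst2>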
*/ 5563 tree dst1, dst2; 5564 if (convert_optab_handler (vec_extract_optab, 5565 TYPE_MODE (TREE_TYPE (new_temp)), 5566 TYPE_MODE (vectype1)) 5567 != CODE_FOR_nothing) 5568 { 5569 /* Extract sub-vectors directly once vec_extract becomes 5570 a conversion optab. */ 5571 dst1 = make_ssa_name (vectype1); 5572 epilog_stmt 5573 = gimple_build_assign (dst1, BIT_FIELD_REF, 5574 build3 (BIT_FIELD_REF, vectype1, 5575 new_temp, TYPE_SIZE (vectype1), 5576 bitsize_int (0))); 5577 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5578 dst2 = make_ssa_name (vectype1); 5579 epilog_stmt 5580 = gimple_build_assign (dst2, BIT_FIELD_REF, 5581 build3 (BIT_FIELD_REF, vectype1, 5582 new_temp, TYPE_SIZE (vectype1), 5583 bitsize_int (sz * BITS_PER_UNIT))); 5584 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5585 } 5586 else 5587 { 5588 /* Extract via punning to appropriately sized integer mode 5589 vector. */ 5590 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 5591 1); 5592 tree etype = build_vector_type (eltype, 2); 5593 gcc_assert (convert_optab_handler (vec_extract_optab, 5594 TYPE_MODE (etype), 5595 TYPE_MODE (eltype)) 5596 != CODE_FOR_nothing); 5597 tree tem = make_ssa_name (etype); 5598 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, 5599 build1 (VIEW_CONVERT_EXPR, 5600 etype, new_temp)); 5601 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5602 new_temp = tem; 5603 tem = make_ssa_name (eltype); 5604 epilog_stmt 5605 = gimple_build_assign (tem, BIT_FIELD_REF, 5606 build3 (BIT_FIELD_REF, eltype, 5607 new_temp, TYPE_SIZE (eltype), 5608 bitsize_int (0))); 5609 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5610 dst1 = make_ssa_name (vectype1); 5611 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, 5612 build1 (VIEW_CONVERT_EXPR, 5613 vectype1, tem)); 5614 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5615 tem = make_ssa_name (eltype); 5616 epilog_stmt 5617 = gimple_build_assign (tem, BIT_FIELD_REF, 5618 build3 (BIT_FIELD_REF, eltype, 5619 new_temp, TYPE_SIZE (eltype), 5620 bitsize_int (sz * BITS_PER_UNIT))); 5621 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5622 dst2 = make_ssa_name (vectype1); 5623 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, 5624 build1 (VIEW_CONVERT_EXPR, 5625 vectype1, tem)); 5626 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5627 } 5628 5629 new_temp = make_ssa_name (vectype1); 5630 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); 5631 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5632 } 5633 5634 if (reduce_with_shift && !slp_reduc) 5635 { 5636 int element_bitsize = tree_to_uhwi (bitsize); 5637 /* Enforced by vectorizable_reduction, which disallows SLP reductions 5638 for variable-length vectors and also requires direct target support 5639 for loop reductions. 
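To illustrate the shift-based scheme that follows, a 4-element PLUS
     reduction of v = { a, b, c, d } proceeds roughly as (the "shifts" are
     really VEC_PERM_EXPRs pulling in zeros from ZERO_VEC):

       v' = { c, d, 0, 0 }               v shifted by 2 elements
       v  = v + v'  = { a+c, b+d, c, d }
       v' = { b+d, c, d, 0 }             v shifted by 1 element
       v  = v + v'  = { a+b+c+d, ... }
       s  = extract element 0 of v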
*/ 5640 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5641 int nelements = vec_size_in_bits / element_bitsize; 5642 vec_perm_builder sel; 5643 vec_perm_indices indices; 5644 5645 int elt_offset; 5646 5647 tree zero_vec = build_zero_cst (vectype1); 5648 /* Case 2: Create: 5649 for (offset = nelements/2; offset >= 1; offset/=2) 5650 { 5651 Create: va' = vec_shift <va, offset> 5652 Create: va = vop <va, va'> 5653 } */ 5654 5655 tree rhs; 5656 5657 if (dump_enabled_p ()) 5658 dump_printf_loc (MSG_NOTE, vect_location, 5659 "Reduce using vector shifts\n"); 5660 5661 mode1 = TYPE_MODE (vectype1); 5662 vec_dest = vect_create_destination_var (scalar_dest, vectype1); 5663 for (elt_offset = nelements / 2; 5664 elt_offset >= 1; 5665 elt_offset /= 2) 5666 { 5667 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); 5668 indices.new_vector (sel, 2, nelements); 5669 tree mask = vect_gen_perm_mask_any (vectype1, indices); 5670 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, 5671 new_temp, zero_vec, mask); 5672 new_name = make_ssa_name (vec_dest, epilog_stmt); 5673 gimple_assign_set_lhs (epilog_stmt, new_name); 5674 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5675 5676 epilog_stmt = gimple_build_assign (vec_dest, code, new_name, 5677 new_temp); 5678 new_temp = make_ssa_name (vec_dest, epilog_stmt); 5679 gimple_assign_set_lhs (epilog_stmt, new_temp); 5680 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5681 } 5682 5683 /* 2.4 Extract the final scalar result. Create: 5684 s_out3 = extract_field <v_out2, bitpos> */ 5685 5686 if (dump_enabled_p ()) 5687 dump_printf_loc (MSG_NOTE, vect_location, 5688 "extract scalar result\n"); 5689 5690 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, 5691 bitsize, bitsize_zero_node); 5692 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5693 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5694 gimple_assign_set_lhs (epilog_stmt, new_temp); 5695 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5696 scalar_results.safe_push (new_temp); 5697 } 5698 else 5699 { 5700 /* Case 3: Create: 5701 s = extract_field <v_out2, 0> 5702 for (offset = element_size; 5703 offset < vector_size; 5704 offset += element_size;) 5705 { 5706 Create: s' = extract_field <v_out2, offset> 5707 Create: s = op <s, s'> // For non SLP cases 5708 } */ 5709 5710 if (dump_enabled_p ()) 5711 dump_printf_loc (MSG_NOTE, vect_location, 5712 "Reduce using scalar code.\n"); 5713 5714 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5715 int element_bitsize = tree_to_uhwi (bitsize); 5716 FOR_EACH_VEC_ELT (new_phis, i, new_phi) 5717 { 5718 int bit_offset; 5719 if (gimple_code (new_phi) == GIMPLE_PHI) 5720 vec_temp = PHI_RESULT (new_phi); 5721 else 5722 vec_temp = gimple_assign_lhs (new_phi); 5723 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 5724 bitsize_zero_node); 5725 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5726 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5727 gimple_assign_set_lhs (epilog_stmt, new_temp); 5728 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5729 5730 /* In SLP we don't need to apply reduction operation, so we just 5731 collect s' values in SCALAR_RESULTS. 
*/ 5732 if (slp_reduc) 5733 scalar_results.safe_push (new_temp); 5734 5735 for (bit_offset = element_bitsize; 5736 bit_offset < vec_size_in_bits; 5737 bit_offset += element_bitsize) 5738 { 5739 tree bitpos = bitsize_int (bit_offset); 5740 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, 5741 bitsize, bitpos); 5742 5743 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5744 new_name = make_ssa_name (new_scalar_dest, epilog_stmt); 5745 gimple_assign_set_lhs (epilog_stmt, new_name); 5746 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5747 5748 if (slp_reduc) 5749 { 5750 /* In SLP we don't need to apply reduction operation, so 5751 we just collect s' values in SCALAR_RESULTS. */ 5752 new_temp = new_name; 5753 scalar_results.safe_push (new_name); 5754 } 5755 else 5756 { 5757 epilog_stmt = gimple_build_assign (new_scalar_dest, code, 5758 new_name, new_temp); 5759 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5760 gimple_assign_set_lhs (epilog_stmt, new_temp); 5761 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5762 } 5763 } 5764 } 5765 5766 /* The only case where we need to reduce scalar results in SLP, is 5767 unrolling. If the size of SCALAR_RESULTS is greater than 5768 GROUP_SIZE, we reduce them combining elements modulo 5769 GROUP_SIZE. */ 5770 if (slp_reduc) 5771 { 5772 tree res, first_res, new_res; 5773 gimple *new_stmt; 5774 5775 /* Reduce multiple scalar results in case of SLP unrolling. */ 5776 for (j = group_size; scalar_results.iterate (j, &res); 5777 j++) 5778 { 5779 first_res = scalar_results[j % group_size]; 5780 new_stmt = gimple_build_assign (new_scalar_dest, code, 5781 first_res, res); 5782 new_res = make_ssa_name (new_scalar_dest, new_stmt); 5783 gimple_assign_set_lhs (new_stmt, new_res); 5784 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT); 5785 scalar_results[j % group_size] = new_res; 5786 } 5787 } 5788 else 5789 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ 5790 scalar_results.safe_push (new_temp); 5791 } 5792 5793 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5794 == INTEGER_INDUC_COND_REDUCTION) 5795 && !operand_equal_p (initial_def, induc_val, 0)) 5796 { 5797 /* Earlier we set the initial value to be a vector if induc_val 5798 values. Check the result and if it is induc_val then replace 5799 with the original initial value, unless induc_val is 5800 the same as initial_def already. */ 5801 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 5802 induc_val); 5803 5804 tree tmp = make_ssa_name (new_scalar_dest); 5805 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 5806 initial_def, new_temp); 5807 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5808 scalar_results[0] = tmp; 5809 } 5810 } 5811 5812 vect_finalize_reduction: 5813 5814 if (double_reduc) 5815 loop = loop->inner; 5816 5817 /* 2.5 Adjust the final result by the initial value of the reduction 5818 variable. (When such adjustment is not needed, then 5819 'adjustment_def' is zero). 
For example, if code is PLUS we create: 5820 new_temp = loop_exit_def + adjustment_def */ 5821 5822 if (adjustment_def) 5823 { 5824 gcc_assert (!slp_reduc); 5825 if (nested_in_vect_loop) 5826 { 5827 new_phi = new_phis[0]; 5828 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); 5829 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); 5830 new_dest = vect_create_destination_var (scalar_dest, vectype); 5831 } 5832 else 5833 { 5834 new_temp = scalar_results[0]; 5835 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 5836 expr = build2 (code, scalar_type, new_temp, adjustment_def); 5837 new_dest = vect_create_destination_var (scalar_dest, scalar_type); 5838 } 5839 5840 epilog_stmt = gimple_build_assign (new_dest, expr); 5841 new_temp = make_ssa_name (new_dest, epilog_stmt); 5842 gimple_assign_set_lhs (epilog_stmt, new_temp); 5843 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5844 if (nested_in_vect_loop) 5845 { 5846 set_vinfo_for_stmt (epilog_stmt, 5847 new_stmt_vec_info (epilog_stmt, loop_vinfo)); 5848 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = 5849 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); 5850 5851 if (!double_reduc) 5852 scalar_results.quick_push (new_temp); 5853 else 5854 scalar_results[0] = new_temp; 5855 } 5856 else 5857 scalar_results[0] = new_temp; 5858 5859 new_phis[0] = epilog_stmt; 5860 } 5861 5862 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 5863 phis with new adjusted scalar results, i.e., replace use <s_out0> 5864 with use <s_out4>. 5865 5866 Transform: 5867 loop_exit: 5868 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5869 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5870 v_out2 = reduce <v_out1> 5871 s_out3 = extract_field <v_out2, 0> 5872 s_out4 = adjust_result <s_out3> 5873 use <s_out0> 5874 use <s_out0> 5875 5876 into: 5877 5878 loop_exit: 5879 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5880 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5881 v_out2 = reduce <v_out1> 5882 s_out3 = extract_field <v_out2, 0> 5883 s_out4 = adjust_result <s_out3> 5884 use <s_out4> 5885 use <s_out4> */ 5886 5887 5888 /* In SLP reduction chain we reduce vector results into one vector if 5889 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of 5890 the last stmt in the reduction chain, since we are looking for the loop 5891 exit phi node. */ 5892 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 5893 { 5894 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 5895 /* Handle reduction patterns. */ 5896 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt))) 5897 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)); 5898 5899 scalar_dest = gimple_assign_lhs (dest_stmt); 5900 group_size = 1; 5901 } 5902 5903 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in 5904 case that GROUP_SIZE is greater than vectorization factor). Therefore, we 5905 need to match SCALAR_RESULTS with corresponding statements. The first 5906 (GROUP_SIZE / number of new vector stmts) scalar results correspond to 5907 the first vector stmt, etc. 5908 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). 
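For instance, with GROUP_SIZE == 4 and two vector stmts, RATIO is 2:
     scalar results 0 and 1 are taken from the first vector stmt and scalar
     results 2 and 3 from the second.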
*/ 5909 if (group_size > new_phis.length ()) 5910 { 5911 ratio = group_size / new_phis.length (); 5912 gcc_assert (!(group_size % new_phis.length ())); 5913 } 5914 else 5915 ratio = 1; 5916 5917 for (k = 0; k < group_size; k++) 5918 { 5919 if (k % ratio == 0) 5920 { 5921 epilog_stmt = new_phis[k / ratio]; 5922 reduction_phi = reduction_phis[k / ratio]; 5923 if (double_reduc) 5924 inner_phi = inner_phis[k / ratio]; 5925 } 5926 5927 if (slp_reduc) 5928 { 5929 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k]; 5930 5931 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt)); 5932 /* SLP statements can't participate in patterns. */ 5933 gcc_assert (!orig_stmt); 5934 scalar_dest = gimple_assign_lhs (current_stmt); 5935 } 5936 5937 phis.create (3); 5938 /* Find the loop-closed-use at the loop exit of the original scalar 5939 result. (The reduction result is expected to have two immediate uses - 5940 one at the latch block, and one at the loop exit). */ 5941 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 5942 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))) 5943 && !is_gimple_debug (USE_STMT (use_p))) 5944 phis.safe_push (USE_STMT (use_p)); 5945 5946 /* While we expect to have found an exit_phi because of loop-closed-ssa 5947 form we can end up without one if the scalar cycle is dead. */ 5948 5949 FOR_EACH_VEC_ELT (phis, i, exit_phi) 5950 { 5951 if (outer_loop) 5952 { 5953 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 5954 gphi *vect_phi; 5955 5956 /* FORNOW. Currently not supporting the case that an inner-loop 5957 reduction is not used in the outer-loop (but only outside the 5958 outer-loop), unless it is double reduction. */ 5959 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 5960 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)) 5961 || double_reduc); 5962 5963 if (double_reduc) 5964 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi; 5965 else 5966 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt; 5967 if (!double_reduc 5968 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) 5969 != vect_double_reduction_def) 5970 continue; 5971 5972 /* Handle double reduction: 5973 5974 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) 5975 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop) 5976 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) 5977 stmt4: s2 = phi <s4> - double reduction stmt (outer loop) 5978 5979 At that point the regular reduction (stmt2 and stmt3) is 5980 already vectorized, as well as the exit phi node, stmt4. 5981 Here we vectorize the phi node of double reduction, stmt1, and 5982 update all relevant statements. */ 5983 5984 /* Go through all the uses of s2 to find double reduction phi 5985 node, i.e., stmt1 above. */ 5986 orig_name = PHI_RESULT (exit_phi); 5987 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 5988 { 5989 stmt_vec_info use_stmt_vinfo; 5990 stmt_vec_info new_phi_vinfo; 5991 tree vect_phi_init, preheader_arg, vect_phi_res; 5992 basic_block bb = gimple_bb (use_stmt); 5993 gimple *use; 5994 5995 /* Check that USE_STMT is really double reduction phi 5996 node. 
*/ 5997 if (gimple_code (use_stmt) != GIMPLE_PHI 5998 || gimple_phi_num_args (use_stmt) != 2 5999 || bb->loop_father != outer_loop) 6000 continue; 6001 use_stmt_vinfo = vinfo_for_stmt (use_stmt); 6002 if (!use_stmt_vinfo 6003 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) 6004 != vect_double_reduction_def) 6005 continue; 6006 6007 /* Create vector phi node for double reduction: 6008 vs1 = phi <vs0, vs2> 6009 vs1 was created previously in this function by a call to 6010 vect_get_vec_def_for_operand and is stored in 6011 vec_initial_def; 6012 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI; 6013 vs0 is created here. */ 6014 6015 /* Create vector phi node. */ 6016 vect_phi = create_phi_node (vec_initial_def, bb); 6017 new_phi_vinfo = new_stmt_vec_info (vect_phi, 6018 loop_vec_info_for_loop (outer_loop)); 6019 set_vinfo_for_stmt (vect_phi, new_phi_vinfo); 6020 6021 /* Create vs0 - initial def of the double reduction phi. */ 6022 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, 6023 loop_preheader_edge (outer_loop)); 6024 vect_phi_init = get_initial_def_for_reduction 6025 (stmt, preheader_arg, NULL); 6026 6027 /* Update phi node arguments with vs0 and vs2. */ 6028 add_phi_arg (vect_phi, vect_phi_init, 6029 loop_preheader_edge (outer_loop), 6030 UNKNOWN_LOCATION); 6031 add_phi_arg (vect_phi, PHI_RESULT (inner_phi), 6032 loop_latch_edge (outer_loop), UNKNOWN_LOCATION); 6033 if (dump_enabled_p ()) 6034 { 6035 dump_printf_loc (MSG_NOTE, vect_location, 6036 "created double reduction phi node: "); 6037 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0); 6038 } 6039 6040 vect_phi_res = PHI_RESULT (vect_phi); 6041 6042 /* Replace the use, i.e., set the correct vs1 in the regular 6043 reduction phi node. FORNOW, NCOPIES is always 1, so the 6044 loop is redundant. */ 6045 use = reduction_phi; 6046 for (j = 0; j < ncopies; j++) 6047 { 6048 edge pr_edge = loop_preheader_edge (loop); 6049 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); 6050 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); 6051 } 6052 } 6053 } 6054 } 6055 6056 phis.release (); 6057 if (nested_in_vect_loop) 6058 { 6059 if (double_reduc) 6060 loop = outer_loop; 6061 else 6062 continue; 6063 } 6064 6065 phis.create (3); 6066 /* Find the loop-closed-use at the loop exit of the original scalar 6067 result. (The reduction result is expected to have two immediate uses, 6068 one at the latch block, and one at the loop exit). For double 6069 reductions we are looking for exit phis of the outer loop. 
*/ 6070 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 6071 { 6072 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 6073 { 6074 if (!is_gimple_debug (USE_STMT (use_p))) 6075 phis.safe_push (USE_STMT (use_p)); 6076 } 6077 else 6078 { 6079 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) 6080 { 6081 tree phi_res = PHI_RESULT (USE_STMT (use_p)); 6082 6083 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) 6084 { 6085 if (!flow_bb_inside_loop_p (loop, 6086 gimple_bb (USE_STMT (phi_use_p))) 6087 && !is_gimple_debug (USE_STMT (phi_use_p))) 6088 phis.safe_push (USE_STMT (phi_use_p)); 6089 } 6090 } 6091 } 6092 } 6093 6094 FOR_EACH_VEC_ELT (phis, i, exit_phi) 6095 { 6096 /* Replace the uses: */ 6097 orig_name = PHI_RESULT (exit_phi); 6098 scalar_result = scalar_results[k]; 6099 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 6100 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 6101 SET_USE (use_p, scalar_result); 6102 } 6103 6104 phis.release (); 6105 } 6106 } 6107 6108 /* Return a vector of type VECTYPE that is equal to the vector select 6109 operation "MASK ? VEC : IDENTITY". Insert the select statements 6110 before GSI. */ 6111 6112 static tree 6113 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, 6114 tree vec, tree identity) 6115 { 6116 tree cond = make_temp_ssa_name (vectype, NULL, "cond"); 6117 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, 6118 mask, vec, identity); 6119 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 6120 return cond; 6121 } 6122 6123 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right 6124 order, starting with LHS. Insert the extraction statements before GSI and 6125 associate the new scalar SSA names with variable SCALAR_DEST. 6126 Return the SSA name for the result. */ 6127 6128 static tree 6129 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, 6130 tree_code code, tree lhs, tree vector_rhs) 6131 { 6132 tree vectype = TREE_TYPE (vector_rhs); 6133 tree scalar_type = TREE_TYPE (vectype); 6134 tree bitsize = TYPE_SIZE (scalar_type); 6135 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 6136 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); 6137 6138 for (unsigned HOST_WIDE_INT bit_offset = 0; 6139 bit_offset < vec_size_in_bits; 6140 bit_offset += element_bitsize) 6141 { 6142 tree bitpos = bitsize_int (bit_offset); 6143 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs, 6144 bitsize, bitpos); 6145 6146 gassign *stmt = gimple_build_assign (scalar_dest, rhs); 6147 rhs = make_ssa_name (scalar_dest, stmt); 6148 gimple_assign_set_lhs (stmt, rhs); 6149 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 6150 6151 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs); 6152 tree new_name = make_ssa_name (scalar_dest, stmt); 6153 gimple_assign_set_lhs (stmt, new_name); 6154 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 6155 lhs = new_name; 6156 } 6157 return lhs; 6158 } 6159 6160 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the 6161 statement that sets the live-out value. REDUC_DEF_STMT is the phi 6162 statement. CODE is the operation performed by STMT and OPS are 6163 its scalar operands. REDUC_INDEX is the index of the operand in 6164 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that 6165 implements in-order reduction, or IFN_LAST if we should open-code it. 6166 VECTYPE_IN is the type of the vector input. 
MASKS specifies the masks 6167 that should be used to control the operation in a fully-masked loop. */ 6168 6169 static bool 6170 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi, 6171 gimple **vec_stmt, slp_tree slp_node, 6172 gimple *reduc_def_stmt, 6173 tree_code code, internal_fn reduc_fn, 6174 tree ops[3], tree vectype_in, 6175 int reduc_index, vec_loop_masks *masks) 6176 { 6177 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6178 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6180 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6181 gimple *new_stmt = NULL; 6182 6183 int ncopies; 6184 if (slp_node) 6185 ncopies = 1; 6186 else 6187 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6188 6189 gcc_assert (!nested_in_vect_loop_p (loop, stmt)); 6190 gcc_assert (ncopies == 1); 6191 gcc_assert (TREE_CODE_LENGTH (code) == binary_op); 6192 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1)); 6193 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6194 == FOLD_LEFT_REDUCTION); 6195 6196 if (slp_node) 6197 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), 6198 TYPE_VECTOR_SUBPARTS (vectype_in))); 6199 6200 tree op0 = ops[1 - reduc_index]; 6201 6202 int group_size = 1; 6203 gimple *scalar_dest_def; 6204 auto_vec<tree> vec_oprnds0; 6205 if (slp_node) 6206 { 6207 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node); 6208 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 6209 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 6210 } 6211 else 6212 { 6213 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt); 6214 vec_oprnds0.create (1); 6215 vec_oprnds0.quick_push (loop_vec_def0); 6216 scalar_dest_def = stmt; 6217 } 6218 6219 tree scalar_dest = gimple_assign_lhs (scalar_dest_def); 6220 tree scalar_type = TREE_TYPE (scalar_dest); 6221 tree reduc_var = gimple_phi_result (reduc_def_stmt); 6222 6223 int vec_num = vec_oprnds0.length (); 6224 gcc_assert (vec_num == 1 || slp_node); 6225 tree vec_elem_type = TREE_TYPE (vectype_out); 6226 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type)); 6227 6228 tree vector_identity = NULL_TREE; 6229 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 6230 vector_identity = build_zero_cst (vectype_out); 6231 6232 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL); 6233 int i; 6234 tree def0; 6235 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 6236 { 6237 tree mask = NULL_TREE; 6238 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 6239 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i); 6240 6241 /* Handle MINUS by adding the negative. */ 6242 if (reduc_fn != IFN_LAST && code == MINUS_EXPR) 6243 { 6244 tree negated = make_ssa_name (vectype_out); 6245 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); 6246 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 6247 def0 = negated; 6248 } 6249 6250 if (mask) 6251 def0 = merge_with_identity (gsi, mask, vectype_out, def0, 6252 vector_identity); 6253 6254 /* On the first iteration the input is simply the scalar phi 6255 result, and for subsequent iterations it is the output of 6256 the preceding operation. */ 6257 if (reduc_fn != IFN_LAST) 6258 { 6259 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); 6260 /* For chained SLP reductions the output of the previous reduction 6261 operation serves as the input of the next. 
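For example, with two vector operands and an in-order addition the calls
     chain roughly like this (a sketch; REDUC_FN would be IFN_FOLD_LEFT_PLUS):

       reduc_1 = FOLD_LEFT_PLUS <reduc_var, def0_0>
       reduc_2 = FOLD_LEFT_PLUS <reduc_1, def0_1>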
For the final statement 6262 the output cannot be a temporary - we reuse the original 6263 scalar destination of the last statement. */ 6264 if (i != vec_num - 1) 6265 { 6266 gimple_set_lhs (new_stmt, scalar_dest_var); 6267 reduc_var = make_ssa_name (scalar_dest_var, new_stmt); 6268 gimple_set_lhs (new_stmt, reduc_var); 6269 } 6270 } 6271 else 6272 { 6273 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code, 6274 reduc_var, def0); 6275 new_stmt = SSA_NAME_DEF_STMT (reduc_var); 6276 /* Remove the statement, so that we can use the same code paths 6277 as for statements that we've just created. */ 6278 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt); 6279 gsi_remove (&tmp_gsi, true); 6280 } 6281 6282 if (i == vec_num - 1) 6283 { 6284 gimple_set_lhs (new_stmt, scalar_dest); 6285 vect_finish_replace_stmt (scalar_dest_def, new_stmt); 6286 } 6287 else 6288 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi); 6289 6290 if (slp_node) 6291 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); 6292 } 6293 6294 if (!slp_node) 6295 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; 6296 6297 return true; 6298 } 6299 6300 /* Function is_nonwrapping_integer_induction. 6301 6302 Check if STMT (which is part of loop LOOP) both increments and 6303 does not cause overflow. */ 6304 6305 static bool 6306 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop) 6307 { 6308 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 6309 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); 6310 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); 6311 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt)); 6312 widest_int ni, max_loop_value, lhs_max; 6313 bool overflow = false; 6314 6315 /* Make sure the loop is integer based. */ 6316 if (TREE_CODE (base) != INTEGER_CST 6317 || TREE_CODE (step) != INTEGER_CST) 6318 return false; 6319 6320 /* Check that the max size of the loop will not wrap. */ 6321 6322 if (TYPE_OVERFLOW_UNDEFINED (lhs_type)) 6323 return true; 6324 6325 if (! max_stmt_executions (loop, &ni)) 6326 return false; 6327 6328 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type), 6329 &overflow); 6330 if (overflow) 6331 return false; 6332 6333 max_loop_value = wi::add (wi::to_widest (base), max_loop_value, 6334 TYPE_SIGN (lhs_type), &overflow); 6335 if (overflow) 6336 return false; 6337 6338 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type)) 6339 <= TYPE_PRECISION (lhs_type)); 6340 } 6341 6342 /* Function vectorizable_reduction. 6343 6344 Check if STMT performs a reduction operation that can be vectorized. 6345 If VEC_STMT is also passed, vectorize the STMT: create a vectorized 6346 stmt to replace it, put it in VEC_STMT, and insert it at GSI. 6347 Return FALSE if not a vectorizable STMT, TRUE otherwise. 6348 6349 This function also handles reduction idioms (patterns) that have been 6350 recognized in advance during vect_pattern_recog. In this case, STMT may be 6351 of this form: 6352 X = pattern_expr (arg0, arg1, ..., X) 6353 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original 6354 sequence that had been detected and replaced by the pattern-stmt (STMT). 6355 6356 This function also handles reduction of condition expressions, for example: 6357 for (int i = 0; i < N; i++) 6358 if (a[i] < value) 6359 last = a[i]; 6360 This is handled by vectorising the loop and creating an additional vector 6361 containing the loop indexes for which "a[i] < value" was true. 
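For instance, if a single vector iteration covers i = 0..7 and the
   condition holds only for i == 2 and i == 5, the index vector ends up
   nonzero only in those two lanes and the data vector holds a[2] and a[5]
   there, with the default value everywhere else.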
In the 6362 function epilogue this is reduced to a single max value and then used to 6363 index into the vector of results. 6364 6365 In some cases of reduction patterns, the type of the reduction variable X is 6366 different than the type of the other arguments of STMT. 6367 In such cases, the vectype that is used when transforming STMT into a vector 6368 stmt is different than the vectype that is used to determine the 6369 vectorization factor, because it consists of a different number of elements 6370 than the actual number of elements that are being operated upon in parallel. 6371 6372 For example, consider an accumulation of shorts into an int accumulator. 6373 On some targets it's possible to vectorize this pattern operating on 8 6374 shorts at a time (hence, the vectype for purposes of determining the 6375 vectorization factor should be V8HI); on the other hand, the vectype that 6376 is used to create the vector form is actually V4SI (the type of the result). 6377 6378 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 6379 indicates what is the actual level of parallelism (V8HI in the example), so 6380 that the right vectorization factor would be derived. This vectype 6381 corresponds to the type of arguments to the reduction stmt, and should *NOT* 6382 be used to create the vectorized stmt. The right vectype for the vectorized 6383 stmt is obtained from the type of the result X: 6384 get_vectype_for_scalar_type (TREE_TYPE (X)) 6385 6386 This means that, contrary to "regular" reductions (or "regular" stmts in 6387 general), the following equation: 6388 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) 6389 does *NOT* necessarily hold for reduction patterns. */ 6390 6391 bool 6392 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, 6393 gimple **vec_stmt, slp_tree slp_node, 6394 slp_instance slp_node_instance) 6395 { 6396 tree vec_dest; 6397 tree scalar_dest; 6398 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6399 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6400 tree vectype_in = NULL_TREE; 6401 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6402 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6403 enum tree_code code, orig_code; 6404 internal_fn reduc_fn; 6405 machine_mode vec_mode; 6406 int op_type; 6407 optab optab; 6408 tree new_temp = NULL_TREE; 6409 gimple *def_stmt; 6410 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; 6411 gimple *cond_reduc_def_stmt = NULL; 6412 enum tree_code cond_reduc_op_code = ERROR_MARK; 6413 tree scalar_type; 6414 bool is_simple_use; 6415 gimple *orig_stmt; 6416 stmt_vec_info orig_stmt_info = NULL; 6417 int i; 6418 int ncopies; 6419 int epilog_copies; 6420 stmt_vec_info prev_stmt_info, prev_phi_info; 6421 bool single_defuse_cycle = false; 6422 gimple *new_stmt = NULL; 6423 int j; 6424 tree ops[3]; 6425 enum vect_def_type dts[3]; 6426 bool nested_cycle = false, found_nested_cycle_def = false; 6427 bool double_reduc = false; 6428 basic_block def_bb; 6429 struct loop * def_stmt_loop, *outer_loop = NULL; 6430 tree def_arg; 6431 gimple *def_arg_stmt; 6432 auto_vec<tree> vec_oprnds0; 6433 auto_vec<tree> vec_oprnds1; 6434 auto_vec<tree> vec_oprnds2; 6435 auto_vec<tree> vect_defs; 6436 auto_vec<gimple *> phis; 6437 int vec_num; 6438 tree def0, tem; 6439 bool first_p = true; 6440 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; 6441 tree cond_reduc_val = NULL_TREE; 6442 6443 /* Make sure it was already recognized as a reduction computation. 
*/ 6444 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def 6445 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle) 6446 return false; 6447 6448 if (nested_in_vect_loop_p (loop, stmt)) 6449 { 6450 outer_loop = loop; 6451 loop = loop->inner; 6452 nested_cycle = true; 6453 } 6454 6455 /* In case of reduction chain we switch to the first stmt in the chain, but 6456 we don't update STMT_INFO, since only the last stmt is marked as reduction 6457 and has reduction properties. */ 6458 if (GROUP_FIRST_ELEMENT (stmt_info) 6459 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 6460 { 6461 stmt = GROUP_FIRST_ELEMENT (stmt_info); 6462 first_p = false; 6463 } 6464 6465 if (gimple_code (stmt) == GIMPLE_PHI) 6466 { 6467 /* Analysis is fully done on the reduction stmt invocation. */ 6468 if (! vec_stmt) 6469 { 6470 if (slp_node) 6471 slp_node_instance->reduc_phis = slp_node; 6472 6473 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6474 return true; 6475 } 6476 6477 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) 6478 /* Leave the scalar phi in place. Note that checking 6479 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works 6480 for reductions involving a single statement. */ 6481 return true; 6482 6483 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info); 6484 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt))) 6485 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt)); 6486 6487 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt)) 6488 == EXTRACT_LAST_REDUCTION) 6489 /* Leave the scalar phi in place. */ 6490 return true; 6491 6492 gcc_assert (is_gimple_assign (reduc_stmt)); 6493 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) 6494 { 6495 tree op = gimple_op (reduc_stmt, k); 6496 if (op == gimple_phi_result (stmt)) 6497 continue; 6498 if (k == 1 6499 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR) 6500 continue; 6501 if (!vectype_in 6502 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6503 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op))))) 6504 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op)); 6505 break; 6506 } 6507 gcc_assert (vectype_in); 6508 6509 if (slp_node) 6510 ncopies = 1; 6511 else 6512 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6513 6514 use_operand_p use_p; 6515 gimple *use_stmt; 6516 if (ncopies > 1 6517 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt)) 6518 <= vect_used_only_live) 6519 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt) 6520 && (use_stmt == reduc_stmt 6521 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) 6522 == reduc_stmt))) 6523 single_defuse_cycle = true; 6524 6525 /* Create the destination vector */ 6526 scalar_dest = gimple_assign_lhs (reduc_stmt); 6527 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 6528 6529 if (slp_node) 6530 /* The size vect_schedule_slp_instance computes is off for us. */ 6531 vec_num = vect_get_num_vectors 6532 (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6533 * SLP_TREE_SCALAR_STMTS (slp_node).length (), 6534 vectype_in); 6535 else 6536 vec_num = 1; 6537 6538 /* Generate the reduction PHIs upfront. */ 6539 prev_phi_info = NULL; 6540 for (j = 0; j < ncopies; j++) 6541 { 6542 if (j == 0 || !single_defuse_cycle) 6543 { 6544 for (i = 0; i < vec_num; i++) 6545 { 6546 /* Create the reduction-phi that defines the reduction 6547 operand. 
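Only the PHI result is created here; the preheader and latch arguments
	     are filled in later, so the node eventually looks like
	       vec_def = PHI <vec_init (preheader), vec_reduc (latch)>
	     where vec_init and vec_reduc are illustrative names.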
*/ 6548 gimple *new_phi = create_phi_node (vec_dest, loop->header); 6549 set_vinfo_for_stmt (new_phi, 6550 new_stmt_vec_info (new_phi, loop_vinfo)); 6551 6552 if (slp_node) 6553 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi); 6554 else 6555 { 6556 if (j == 0) 6557 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi; 6558 else 6559 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; 6560 prev_phi_info = vinfo_for_stmt (new_phi); 6561 } 6562 } 6563 } 6564 } 6565 6566 return true; 6567 } 6568 6569 /* 1. Is vectorizable reduction? */ 6570 /* Not supportable if the reduction variable is used in the loop, unless 6571 it's a reduction chain. */ 6572 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer 6573 && !GROUP_FIRST_ELEMENT (stmt_info)) 6574 return false; 6575 6576 /* Reductions that are not used even in an enclosing outer-loop, 6577 are expected to be "live" (used out of the loop). */ 6578 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope 6579 && !STMT_VINFO_LIVE_P (stmt_info)) 6580 return false; 6581 6582 /* 2. Has this been recognized as a reduction pattern? 6583 6584 Check if STMT represents a pattern that has been recognized 6585 in earlier analysis stages. For stmts that represent a pattern, 6586 the STMT_VINFO_RELATED_STMT field records the last stmt in 6587 the original sequence that constitutes the pattern. */ 6588 6589 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)); 6590 if (orig_stmt) 6591 { 6592 orig_stmt_info = vinfo_for_stmt (orig_stmt); 6593 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 6594 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 6595 } 6596 6597 /* 3. Check the operands of the operation. The first operands are defined 6598 inside the loop body. The last operand is the reduction variable, 6599 which is defined by the loop-header-phi. */ 6600 6601 gcc_assert (is_gimple_assign (stmt)); 6602 6603 /* Flatten RHS. */ 6604 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 6605 { 6606 case GIMPLE_BINARY_RHS: 6607 code = gimple_assign_rhs_code (stmt); 6608 op_type = TREE_CODE_LENGTH (code); 6609 gcc_assert (op_type == binary_op); 6610 ops[0] = gimple_assign_rhs1 (stmt); 6611 ops[1] = gimple_assign_rhs2 (stmt); 6612 break; 6613 6614 case GIMPLE_TERNARY_RHS: 6615 code = gimple_assign_rhs_code (stmt); 6616 op_type = TREE_CODE_LENGTH (code); 6617 gcc_assert (op_type == ternary_op); 6618 ops[0] = gimple_assign_rhs1 (stmt); 6619 ops[1] = gimple_assign_rhs2 (stmt); 6620 ops[2] = gimple_assign_rhs3 (stmt); 6621 break; 6622 6623 case GIMPLE_UNARY_RHS: 6624 return false; 6625 6626 default: 6627 gcc_unreachable (); 6628 } 6629 6630 if (code == COND_EXPR && slp_node) 6631 return false; 6632 6633 scalar_dest = gimple_assign_lhs (stmt); 6634 scalar_type = TREE_TYPE (scalar_dest); 6635 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 6636 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 6637 return false; 6638 6639 /* Do not try to vectorize bit-precision reductions. */ 6640 if (!type_has_mode_precision_p (scalar_type)) 6641 return false; 6642 6643 /* All uses but the last are expected to be defined in the loop. 6644 The last use is the reduction variable. In case of nested cycle this 6645 assumption is not true: we use reduc_index to record the index of the 6646 reduction variable. */ 6647 gimple *reduc_def_stmt = NULL; 6648 int reduc_index = -1; 6649 for (i = 0; i < op_type; i++) 6650 { 6651 /* The condition of COND_EXPR is checked in vectorizable_condition(). 
*/ 6652 if (i == 0 && code == COND_EXPR) 6653 continue; 6654 6655 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, 6656 &def_stmt, &dts[i], &tem); 6657 dt = dts[i]; 6658 gcc_assert (is_simple_use); 6659 if (dt == vect_reduction_def) 6660 { 6661 reduc_def_stmt = def_stmt; 6662 reduc_index = i; 6663 continue; 6664 } 6665 else if (tem) 6666 { 6667 /* To properly compute ncopies we are interested in the widest 6668 input type in case we're looking at a widening accumulation. */ 6669 if (!vectype_in 6670 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6671 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) 6672 vectype_in = tem; 6673 } 6674 6675 if (dt != vect_internal_def 6676 && dt != vect_external_def 6677 && dt != vect_constant_def 6678 && dt != vect_induction_def 6679 && !(dt == vect_nested_cycle && nested_cycle)) 6680 return false; 6681 6682 if (dt == vect_nested_cycle) 6683 { 6684 found_nested_cycle_def = true; 6685 reduc_def_stmt = def_stmt; 6686 reduc_index = i; 6687 } 6688 6689 if (i == 1 && code == COND_EXPR) 6690 { 6691 /* Record how value of COND_EXPR is defined. */ 6692 if (dt == vect_constant_def) 6693 { 6694 cond_reduc_dt = dt; 6695 cond_reduc_val = ops[i]; 6696 } 6697 if (dt == vect_induction_def 6698 && def_stmt != NULL 6699 && is_nonwrapping_integer_induction (def_stmt, loop)) 6700 { 6701 cond_reduc_dt = dt; 6702 cond_reduc_def_stmt = def_stmt; 6703 } 6704 } 6705 } 6706 6707 if (!vectype_in) 6708 vectype_in = vectype_out; 6709 6710 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not 6711 directy used in stmt. */ 6712 if (reduc_index == -1) 6713 { 6714 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) 6715 { 6716 if (dump_enabled_p ()) 6717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6718 "in-order reduction chain without SLP.\n"); 6719 return false; 6720 } 6721 6722 if (orig_stmt) 6723 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info); 6724 else 6725 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info); 6726 } 6727 6728 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI) 6729 return false; 6730 6731 if (!(reduc_index == -1 6732 || dts[reduc_index] == vect_reduction_def 6733 || dts[reduc_index] == vect_nested_cycle 6734 || ((dts[reduc_index] == vect_internal_def 6735 || dts[reduc_index] == vect_external_def 6736 || dts[reduc_index] == vect_constant_def 6737 || dts[reduc_index] == vect_induction_def) 6738 && nested_cycle && found_nested_cycle_def))) 6739 { 6740 /* For pattern recognized stmts, orig_stmt might be a reduction, 6741 but some helper statements for the pattern might not, or 6742 might be COND_EXPRs with reduction uses in the condition. */ 6743 gcc_assert (orig_stmt); 6744 return false; 6745 } 6746 6747 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt); 6748 enum vect_reduction_type v_reduc_type 6749 = STMT_VINFO_REDUC_TYPE (reduc_def_info); 6750 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info); 6751 6752 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type; 6753 /* If we have a condition reduction, see if we can simplify it further. */ 6754 if (v_reduc_type == COND_REDUCTION) 6755 { 6756 /* TODO: We can't yet handle reduction chains, since we need to treat 6757 each COND_EXPR in the chain specially, not just the last one. 6758 E.g. for: 6759 6760 x_1 = PHI <x_3, ...> 6761 x_2 = a_2 ? ... : x_1; 6762 x_3 = a_3 ? ... 
: x_2; 6763 6764 we're interested in the last element in x_3 for which a_2 || a_3 6765 is true, whereas the current reduction chain handling would 6766 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3 6767 as a reduction operation. */ 6768 if (reduc_index == -1) 6769 { 6770 if (dump_enabled_p ()) 6771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6772 "conditional reduction chains not supported\n"); 6773 return false; 6774 } 6775 6776 /* vect_is_simple_reduction ensured that operand 2 is the 6777 loop-carried operand. */ 6778 gcc_assert (reduc_index == 2); 6779 6780 /* Loop peeling modifies initial value of reduction PHI, which 6781 makes the reduction stmt to be transformed different to the 6782 original stmt analyzed. We need to record reduction code for 6783 CONST_COND_REDUCTION type reduction at analyzing stage, thus 6784 it can be used directly at transform stage. */ 6785 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR 6786 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR) 6787 { 6788 /* Also set the reduction type to CONST_COND_REDUCTION. */ 6789 gcc_assert (cond_reduc_dt == vect_constant_def); 6790 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; 6791 } 6792 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, 6793 vectype_in, OPTIMIZE_FOR_SPEED)) 6794 { 6795 if (dump_enabled_p ()) 6796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6797 "optimizing condition reduction with" 6798 " FOLD_EXTRACT_LAST.\n"); 6799 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; 6800 } 6801 else if (cond_reduc_dt == vect_induction_def) 6802 { 6803 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt); 6804 tree base 6805 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); 6806 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); 6807 6808 gcc_assert (TREE_CODE (base) == INTEGER_CST 6809 && TREE_CODE (step) == INTEGER_CST); 6810 cond_reduc_val = NULL_TREE; 6811 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); 6812 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) 6813 ; 6814 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR 6815 above base; punt if base is the minimum value of the type for 6816 MAX_EXPR or maximum value of the type for MIN_EXPR for now. 
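For example, a decreasing induction starting at base == 7 uses MIN_EXPR
	     with cond_reduc_val == 8 (or 0 whenever the base is negative), while
	     an increasing induction starting at 7 uses MAX_EXPR with
	     cond_reduc_val == 0.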
*/ 6817 else if (tree_int_cst_sgn (step) == -1) 6818 { 6819 cond_reduc_op_code = MIN_EXPR; 6820 if (tree_int_cst_sgn (base) == -1) 6821 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6822 else if (tree_int_cst_lt (base, 6823 TYPE_MAX_VALUE (TREE_TYPE (base)))) 6824 cond_reduc_val 6825 = int_const_binop (PLUS_EXPR, base, integer_one_node); 6826 } 6827 else 6828 { 6829 cond_reduc_op_code = MAX_EXPR; 6830 if (tree_int_cst_sgn (base) == 1) 6831 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6832 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), 6833 base)) 6834 cond_reduc_val 6835 = int_const_binop (MINUS_EXPR, base, integer_one_node); 6836 } 6837 if (cond_reduc_val) 6838 { 6839 if (dump_enabled_p ()) 6840 dump_printf_loc (MSG_NOTE, vect_location, 6841 "condition expression based on " 6842 "integer induction.\n"); 6843 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6844 = INTEGER_INDUC_COND_REDUCTION; 6845 } 6846 } 6847 else if (cond_reduc_dt == vect_constant_def) 6848 { 6849 enum vect_def_type cond_initial_dt; 6850 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]); 6851 tree cond_initial_val 6852 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); 6853 6854 gcc_assert (cond_reduc_val != NULL_TREE); 6855 vect_is_simple_use (cond_initial_val, loop_vinfo, 6856 &def_stmt, &cond_initial_dt); 6857 if (cond_initial_dt == vect_constant_def 6858 && types_compatible_p (TREE_TYPE (cond_initial_val), 6859 TREE_TYPE (cond_reduc_val))) 6860 { 6861 tree e = fold_binary (LE_EXPR, boolean_type_node, 6862 cond_initial_val, cond_reduc_val); 6863 if (e && (integer_onep (e) || integer_zerop (e))) 6864 { 6865 if (dump_enabled_p ()) 6866 dump_printf_loc (MSG_NOTE, vect_location, 6867 "condition expression based on " 6868 "compile time constant.\n"); 6869 /* Record reduction code at analysis stage. */ 6870 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) 6871 = integer_onep (e) ? MAX_EXPR : MIN_EXPR; 6872 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6873 = CONST_COND_REDUCTION; 6874 } 6875 } 6876 } 6877 } 6878 6879 if (orig_stmt) 6880 gcc_assert (tmp == orig_stmt 6881 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt); 6882 else 6883 /* We changed STMT to be the first stmt in reduction chain, hence we 6884 check that in this case the first element in the chain is STMT. */ 6885 gcc_assert (stmt == tmp 6886 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt); 6887 6888 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt))) 6889 return false; 6890 6891 if (slp_node) 6892 ncopies = 1; 6893 else 6894 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6895 6896 gcc_assert (ncopies >= 1); 6897 6898 vec_mode = TYPE_MODE (vectype_in); 6899 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); 6900 6901 if (code == COND_EXPR) 6902 { 6903 /* Only call during the analysis stage, otherwise we'll lose 6904 STMT_VINFO_TYPE. */ 6905 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL, 6906 ops[reduc_index], 0, NULL)) 6907 { 6908 if (dump_enabled_p ()) 6909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6910 "unsupported condition in reduction\n"); 6911 return false; 6912 } 6913 } 6914 else 6915 { 6916 /* 4. Supportable by target? */ 6917 6918 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR 6919 || code == LROTATE_EXPR || code == RROTATE_EXPR) 6920 { 6921 /* Shifts and rotates are only supported by vectorizable_shifts, 6922 not vectorizable_reduction. 
*/ 6923 if (dump_enabled_p ()) 6924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6925 "unsupported shift or rotation.\n"); 6926 return false; 6927 } 6928 6929 /* 4.1. check support for the operation in the loop */ 6930 optab = optab_for_tree_code (code, vectype_in, optab_default); 6931 if (!optab) 6932 { 6933 if (dump_enabled_p ()) 6934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6935 "no optab.\n"); 6936 6937 return false; 6938 } 6939 6940 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) 6941 { 6942 if (dump_enabled_p ()) 6943 dump_printf (MSG_NOTE, "op not supported by target.\n"); 6944 6945 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) 6946 || !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6947 return false; 6948 6949 if (dump_enabled_p ()) 6950 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); 6951 } 6952 6953 /* Worthwhile without SIMD support? */ 6954 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in)) 6955 && !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6956 { 6957 if (dump_enabled_p ()) 6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6959 "not worthwhile without SIMD support.\n"); 6960 6961 return false; 6962 } 6963 } 6964 6965 /* 4.2. Check support for the epilog operation. 6966 6967 If STMT represents a reduction pattern, then the type of the 6968 reduction variable may be different than the type of the rest 6969 of the arguments. For example, consider the case of accumulation 6970 of shorts into an int accumulator; The original code: 6971 S1: int_a = (int) short_a; 6972 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; 6973 6974 was replaced with: 6975 STMT: int_acc = widen_sum <short_a, int_acc> 6976 6977 This means that: 6978 1. The tree-code that is used to create the vector operation in the 6979 epilog code (that reduces the partial results) is not the 6980 tree-code of STMT, but is rather the tree-code of the original 6981 stmt from the pattern that STMT is replacing. I.e, in the example 6982 above we want to use 'widen_sum' in the loop, but 'plus' in the 6983 epilog. 6984 2. The type (mode) we use to check available target support 6985 for the vector operation to be created in the *epilog*, is 6986 determined by the type of the reduction variable (in the example 6987 above we'd check this: optab_handler (plus_optab, vect_int_mode])). 6988 However the type (mode) we use to check available target support 6989 for the vector operation to be created *inside the loop*, is 6990 determined by the type of the other arguments to STMT (in the 6991 example we'd check this: optab_handler (widen_sum_optab, 6992 vect_short_mode)). 6993 6994 This is contrary to "regular" reductions, in which the types of all 6995 the arguments are the same as the type of the reduction variable. 6996 For "regular" reductions we can therefore use the same vector type 6997 (and also the same tree-code) when generating the epilog code and 6998 when generating the code inside the loop. */ 6999 7000 vect_reduction_type reduction_type 7001 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); 7002 if (orig_stmt 7003 && (reduction_type == TREE_CODE_REDUCTION 7004 || reduction_type == FOLD_LEFT_REDUCTION)) 7005 { 7006 /* This is a reduction pattern: get the vectype from the type of the 7007 reduction variable, and get the tree-code from orig_stmt. 
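	 In the widen_sum example given above this means (vector modes purely
	 for illustration, assuming 16-byte vectors): orig_code becomes
	 PLUS_EXPR and vec_mode the mode of the int accumulator vector,
	 e.g. V4SImode, so the epilog support checks below are made against
	 vectype_out rather than against the V8HI input type that the in-loop
	 widening operation was checked with.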
*/ 7008 orig_code = gimple_assign_rhs_code (orig_stmt); 7009 gcc_assert (vectype_out); 7010 vec_mode = TYPE_MODE (vectype_out); 7011 } 7012 else 7013 { 7014 /* Regular reduction: use the same vectype and tree-code as used for 7015 the vector code inside the loop can be used for the epilog code. */ 7016 orig_code = code; 7017 7018 if (code == MINUS_EXPR) 7019 orig_code = PLUS_EXPR; 7020 7021 /* For simple condition reductions, replace with the actual expression 7022 we want to base our reduction around. */ 7023 if (reduction_type == CONST_COND_REDUCTION) 7024 { 7025 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 7026 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); 7027 } 7028 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) 7029 orig_code = cond_reduc_op_code; 7030 } 7031 7032 if (nested_cycle) 7033 { 7034 def_bb = gimple_bb (reduc_def_stmt); 7035 def_stmt_loop = def_bb->loop_father; 7036 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 7037 loop_preheader_edge (def_stmt_loop)); 7038 if (TREE_CODE (def_arg) == SSA_NAME 7039 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg)) 7040 && gimple_code (def_arg_stmt) == GIMPLE_PHI 7041 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt)) 7042 && vinfo_for_stmt (def_arg_stmt) 7043 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt)) 7044 == vect_double_reduction_def) 7045 double_reduc = true; 7046 } 7047 7048 reduc_fn = IFN_LAST; 7049 7050 if (reduction_type == TREE_CODE_REDUCTION 7051 || reduction_type == FOLD_LEFT_REDUCTION 7052 || reduction_type == INTEGER_INDUC_COND_REDUCTION 7053 || reduction_type == CONST_COND_REDUCTION) 7054 { 7055 if (reduction_type == FOLD_LEFT_REDUCTION 7056 ? fold_left_reduction_fn (orig_code, &reduc_fn) 7057 : reduction_fn_for_scalar_code (orig_code, &reduc_fn)) 7058 { 7059 if (reduc_fn != IFN_LAST 7060 && !direct_internal_fn_supported_p (reduc_fn, vectype_out, 7061 OPTIMIZE_FOR_SPEED)) 7062 { 7063 if (dump_enabled_p ()) 7064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7065 "reduc op not supported by target.\n"); 7066 7067 reduc_fn = IFN_LAST; 7068 } 7069 } 7070 else 7071 { 7072 if (!nested_cycle || double_reduc) 7073 { 7074 if (dump_enabled_p ()) 7075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7076 "no reduc code for scalar code.\n"); 7077 7078 return false; 7079 } 7080 } 7081 } 7082 else if (reduction_type == COND_REDUCTION) 7083 { 7084 int scalar_precision 7085 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 7086 cr_index_scalar_type = make_unsigned_type (scalar_precision); 7087 cr_index_vector_type = build_vector_type (cr_index_scalar_type, 7088 nunits_out); 7089 7090 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, 7091 OPTIMIZE_FOR_SPEED)) 7092 reduc_fn = IFN_REDUC_MAX; 7093 } 7094 7095 if (reduction_type != EXTRACT_LAST_REDUCTION 7096 && reduc_fn == IFN_LAST 7097 && !nunits_out.is_constant ()) 7098 { 7099 if (dump_enabled_p ()) 7100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7101 "missing target support for reduction on" 7102 " variable-length vectors.\n"); 7103 return false; 7104 } 7105 7106 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) 7107 && ncopies > 1) 7108 { 7109 if (dump_enabled_p ()) 7110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7111 "multiple types in double reduction or condition " 7112 "reduction.\n"); 7113 return false; 7114 } 7115 7116 /* For SLP reductions, see if there is a neutral value we can use. 
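     A neutral value is one that leaves the reduction result unchanged,
     e.g. 0 for a sum or a bitwise OR/XOR, 1 for a product and an all-ones
     constant for a bitwise AND; MIN_EXPR/MAX_EXPR have no universal neutral
     value, so for those only a reduction chain's common initial value can
     play that role (illustrative summary; neutral_op_for_slp_reduction
     below is the authoritative source).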
*/ 7117 tree neutral_op = NULL_TREE; 7118 if (slp_node) 7119 neutral_op 7120 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code, 7121 GROUP_FIRST_ELEMENT (stmt_info) != NULL); 7122 7123 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) 7124 { 7125 /* We can't support in-order reductions of code such as this: 7126 7127 for (int i = 0; i < n1; ++i) 7128 for (int j = 0; j < n2; ++j) 7129 l += a[j]; 7130 7131 since GCC effectively transforms the loop when vectorizing: 7132 7133 for (int i = 0; i < n1 / VF; ++i) 7134 for (int j = 0; j < n2; ++j) 7135 for (int k = 0; k < VF; ++k) 7136 l += a[j]; 7137 7138 which is a reassociation of the original operation. */ 7139 if (dump_enabled_p ()) 7140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7141 "in-order double reduction not supported.\n"); 7142 7143 return false; 7144 } 7145 7146 if (reduction_type == FOLD_LEFT_REDUCTION 7147 && slp_node 7148 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 7149 { 7150 /* We cannot use in-order reductions in this case because there is 7151 an implicit reassociation of the operations involved. */ 7152 if (dump_enabled_p ()) 7153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7154 "in-order unchained SLP reductions not supported.\n"); 7155 return false; 7156 } 7157 7158 /* For double reductions, and for SLP reductions with a neutral value, 7159 we construct a variable-length initial vector by loading a vector 7160 full of the neutral value and then shift-and-inserting the start 7161 values into the low-numbered elements. */ 7162 if ((double_reduc || neutral_op) 7163 && !nunits_out.is_constant () 7164 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, 7165 vectype_out, OPTIMIZE_FOR_SPEED)) 7166 { 7167 if (dump_enabled_p ()) 7168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7169 "reduction on variable-length vectors requires" 7170 " target support for a vector-shift-and-insert" 7171 " operation.\n"); 7172 return false; 7173 } 7174 7175 /* Check extra constraints for variable-length unchained SLP reductions. */ 7176 if (STMT_SLP_TYPE (stmt_info) 7177 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) 7178 && !nunits_out.is_constant ()) 7179 { 7180 /* We checked above that we could build the initial vector when 7181 there's a neutral element value. Check here for the case in 7182 which each SLP statement has its own initial value and in which 7183 that value needs to be repeated for every instance of the 7184 statement within the initial vector. */ 7185 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7186 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out)); 7187 if (!neutral_op 7188 && !can_duplicate_and_interleave_p (group_size, elt_mode)) 7189 { 7190 if (dump_enabled_p ()) 7191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7192 "unsupported form of SLP reduction for" 7193 " variable-length vectors: cannot build" 7194 " initial vector.\n"); 7195 return false; 7196 } 7197 /* The epilogue code relies on the number of elements being a multiple 7198 of the group size. The duplicate-and-interleave approach to setting 7199 up the the initial vector does too. 
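	 For example (illustrative counts): with a group size of 2 and a
	 variable-length vector of 4*N elements the multiple_p test below
	 succeeds, whereas a group size of 3 cannot be proven to divide 4*N
	 and the reduction is rejected.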
*/ 7200 if (!multiple_p (nunits_out, group_size)) 7201 { 7202 if (dump_enabled_p ()) 7203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7204 "unsupported form of SLP reduction for" 7205 " variable-length vectors: the vector size" 7206 " is not a multiple of the number of results.\n"); 7207 return false; 7208 } 7209 } 7210 7211 /* In case of widenning multiplication by a constant, we update the type 7212 of the constant to be the type of the other operand. We check that the 7213 constant fits the type in the pattern recognition pass. */ 7214 if (code == DOT_PROD_EXPR 7215 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1]))) 7216 { 7217 if (TREE_CODE (ops[0]) == INTEGER_CST) 7218 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]); 7219 else if (TREE_CODE (ops[1]) == INTEGER_CST) 7220 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]); 7221 else 7222 { 7223 if (dump_enabled_p ()) 7224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7225 "invalid types in dot-prod\n"); 7226 7227 return false; 7228 } 7229 } 7230 7231 if (reduction_type == COND_REDUCTION) 7232 { 7233 widest_int ni; 7234 7235 if (! max_loop_iterations (loop, &ni)) 7236 { 7237 if (dump_enabled_p ()) 7238 dump_printf_loc (MSG_NOTE, vect_location, 7239 "loop count not known, cannot create cond " 7240 "reduction.\n"); 7241 return false; 7242 } 7243 /* Convert backedges to iterations. */ 7244 ni += 1; 7245 7246 /* The additional index will be the same type as the condition. Check 7247 that the loop can fit into this less one (because we'll use up the 7248 zero slot for when there are no matches). */ 7249 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); 7250 if (wi::geu_p (ni, wi::to_widest (max_index))) 7251 { 7252 if (dump_enabled_p ()) 7253 dump_printf_loc (MSG_NOTE, vect_location, 7254 "loop size is greater than data size.\n"); 7255 return false; 7256 } 7257 } 7258 7259 /* In case the vectorization factor (VF) is bigger than the number 7260 of elements that we can fit in a vectype (nunits), we have to generate 7261 more than one vector stmt - i.e - we need to "unroll" the 7262 vector stmt by a factor VF/nunits. For more details see documentation 7263 in vectorizable_operation. */ 7264 7265 /* If the reduction is used in an outer loop we need to generate 7266 VF intermediate results, like so (e.g. for ncopies=2): 7267 r0 = phi (init, r0) 7268 r1 = phi (init, r1) 7269 r0 = x0 + r0; 7270 r1 = x1 + r1; 7271 (i.e. we generate VF results in 2 registers). 7272 In this case we have a separate def-use cycle for each copy, and therefore 7273 for each copy we get the vector def for the reduction variable from the 7274 respective phi node created for this copy. 7275 7276 Otherwise (the reduction is unused in the loop nest), we can combine 7277 together intermediate results, like so (e.g. for ncopies=2): 7278 r = phi (init, r) 7279 r = x0 + r; 7280 r = x1 + r; 7281 (i.e. we generate VF/2 results in a single register). 7282 In this case for each copy we get the vector def for the reduction variable 7283 from the vectorized reduction operation generated in the previous iteration. 7284 7285 This only works when we see both the reduction PHI and its only consumer 7286 in vectorizable_reduction and there are no intermediate stmts 7287 participating. 
*/ 7288 use_operand_p use_p; 7289 gimple *use_stmt; 7290 if (ncopies > 1 7291 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) 7292 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt) 7293 && (use_stmt == stmt 7294 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt)) 7295 { 7296 single_defuse_cycle = true; 7297 epilog_copies = 1; 7298 } 7299 else 7300 epilog_copies = ncopies; 7301 7302 /* If the reduction stmt is one of the patterns that have lane 7303 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ 7304 if ((ncopies > 1 7305 && ! single_defuse_cycle) 7306 && (code == DOT_PROD_EXPR 7307 || code == WIDEN_SUM_EXPR 7308 || code == SAD_EXPR)) 7309 { 7310 if (dump_enabled_p ()) 7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7312 "multi def-use cycle not possible for lane-reducing " 7313 "reduction operation\n"); 7314 return false; 7315 } 7316 7317 if (slp_node) 7318 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7319 else 7320 vec_num = 1; 7321 7322 internal_fn cond_fn = get_conditional_internal_fn (code); 7323 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 7324 7325 if (!vec_stmt) /* transformation not required. */ 7326 { 7327 if (first_p) 7328 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies); 7329 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 7330 { 7331 if (reduction_type != FOLD_LEFT_REDUCTION 7332 && (cond_fn == IFN_LAST 7333 || !direct_internal_fn_supported_p (cond_fn, vectype_in, 7334 OPTIMIZE_FOR_SPEED))) 7335 { 7336 if (dump_enabled_p ()) 7337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7338 "can't use a fully-masked loop because no" 7339 " conditional operation is available.\n"); 7340 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 7341 } 7342 else if (reduc_index == -1) 7343 { 7344 if (dump_enabled_p ()) 7345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7346 "can't use a fully-masked loop for chained" 7347 " reductions.\n"); 7348 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 7349 } 7350 else 7351 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, 7352 vectype_in); 7353 } 7354 if (dump_enabled_p () 7355 && reduction_type == FOLD_LEFT_REDUCTION) 7356 dump_printf_loc (MSG_NOTE, vect_location, 7357 "using an in-order (fold-left) reduction.\n"); 7358 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 7359 return true; 7360 } 7361 7362 /* Transform. */ 7363 7364 if (dump_enabled_p ()) 7365 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); 7366 7367 /* FORNOW: Multiple types are not supported for condition. 
*/ 7368 if (code == COND_EXPR) 7369 gcc_assert (ncopies == 1); 7370 7371 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 7372 7373 if (reduction_type == FOLD_LEFT_REDUCTION) 7374 return vectorize_fold_left_reduction 7375 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code, 7376 reduc_fn, ops, vectype_in, reduc_index, masks); 7377 7378 if (reduction_type == EXTRACT_LAST_REDUCTION) 7379 { 7380 gcc_assert (!slp_node); 7381 return vectorizable_condition (stmt, gsi, vec_stmt, 7382 NULL, reduc_index, NULL); 7383 } 7384 7385 /* Create the destination vector */ 7386 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 7387 7388 prev_stmt_info = NULL; 7389 prev_phi_info = NULL; 7390 if (!slp_node) 7391 { 7392 vec_oprnds0.create (1); 7393 vec_oprnds1.create (1); 7394 if (op_type == ternary_op) 7395 vec_oprnds2.create (1); 7396 } 7397 7398 phis.create (vec_num); 7399 vect_defs.create (vec_num); 7400 if (!slp_node) 7401 vect_defs.quick_push (NULL_TREE); 7402 7403 if (slp_node) 7404 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis)); 7405 else 7406 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt))); 7407 7408 for (j = 0; j < ncopies; j++) 7409 { 7410 if (code == COND_EXPR) 7411 { 7412 gcc_assert (!slp_node); 7413 vectorizable_condition (stmt, gsi, vec_stmt, 7414 PHI_RESULT (phis[0]), 7415 reduc_index, NULL); 7416 /* Multiple types are not supported for condition. */ 7417 break; 7418 } 7419 7420 /* Handle uses. */ 7421 if (j == 0) 7422 { 7423 if (slp_node) 7424 { 7425 /* Get vec defs for all the operands except the reduction index, 7426 ensuring the ordering of the ops in the vector is kept. */ 7427 auto_vec<tree, 3> slp_ops; 7428 auto_vec<vec<tree>, 3> vec_defs; 7429 7430 slp_ops.quick_push (ops[0]); 7431 slp_ops.quick_push (ops[1]); 7432 if (op_type == ternary_op) 7433 slp_ops.quick_push (ops[2]); 7434 7435 vect_get_slp_defs (slp_ops, slp_node, &vec_defs); 7436 7437 vec_oprnds0.safe_splice (vec_defs[0]); 7438 vec_defs[0].release (); 7439 vec_oprnds1.safe_splice (vec_defs[1]); 7440 vec_defs[1].release (); 7441 if (op_type == ternary_op) 7442 { 7443 vec_oprnds2.safe_splice (vec_defs[2]); 7444 vec_defs[2].release (); 7445 } 7446 } 7447 else 7448 { 7449 vec_oprnds0.quick_push 7450 (vect_get_vec_def_for_operand (ops[0], stmt)); 7451 vec_oprnds1.quick_push 7452 (vect_get_vec_def_for_operand (ops[1], stmt)); 7453 if (op_type == ternary_op) 7454 vec_oprnds2.quick_push 7455 (vect_get_vec_def_for_operand (ops[2], stmt)); 7456 } 7457 } 7458 else 7459 { 7460 if (!slp_node) 7461 { 7462 gcc_assert (reduc_index != -1 || ! single_defuse_cycle); 7463 7464 if (single_defuse_cycle && reduc_index == 0) 7465 vec_oprnds0[0] = gimple_get_lhs (new_stmt); 7466 else 7467 vec_oprnds0[0] 7468 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]); 7469 if (single_defuse_cycle && reduc_index == 1) 7470 vec_oprnds1[0] = gimple_get_lhs (new_stmt); 7471 else 7472 vec_oprnds1[0] 7473 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]); 7474 if (op_type == ternary_op) 7475 { 7476 if (single_defuse_cycle && reduc_index == 2) 7477 vec_oprnds2[0] = gimple_get_lhs (new_stmt); 7478 else 7479 vec_oprnds2[0] 7480 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]); 7481 } 7482 } 7483 } 7484 7485 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 7486 { 7487 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; 7488 if (masked_loop_p) 7489 { 7490 /* Make sure that the reduction accumulator is vop[0]. 
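	     For a PLUS_EXPR reduction the masked statement built below would
	     look roughly like (illustrative GIMPLE, SSA names invented):
	       vec_acc_1 = .COND_ADD (loop_mask_2, vec_acc_0, vec_oprnd_3);
	     inactive lanes keep the value of the accumulator operand, which
	     is why the accumulator has to sit in vop[0].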
*/ 7491 if (reduc_index == 1) 7492 { 7493 gcc_assert (commutative_tree_code (code)); 7494 std::swap (vop[0], vop[1]); 7495 } 7496 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, 7497 vectype_in, i * ncopies + j); 7498 gcall *call = gimple_build_call_internal (cond_fn, 3, mask, 7499 vop[0], vop[1]); 7500 new_temp = make_ssa_name (vec_dest, call); 7501 gimple_call_set_lhs (call, new_temp); 7502 gimple_call_set_nothrow (call, true); 7503 new_stmt = call; 7504 } 7505 else 7506 { 7507 if (op_type == ternary_op) 7508 vop[2] = vec_oprnds2[i]; 7509 7510 new_temp = make_ssa_name (vec_dest, new_stmt); 7511 new_stmt = gimple_build_assign (new_temp, code, 7512 vop[0], vop[1], vop[2]); 7513 } 7514 vect_finish_stmt_generation (stmt, new_stmt, gsi); 7515 7516 if (slp_node) 7517 { 7518 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); 7519 vect_defs.quick_push (new_temp); 7520 } 7521 else 7522 vect_defs[0] = new_temp; 7523 } 7524 7525 if (slp_node) 7526 continue; 7527 7528 if (j == 0) 7529 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; 7530 else 7531 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; 7532 7533 prev_stmt_info = vinfo_for_stmt (new_stmt); 7534 } 7535 7536 /* Finalize the reduction-phi (set its arguments) and create the 7537 epilog reduction code. */ 7538 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) 7539 vect_defs[0] = gimple_get_lhs (*vec_stmt); 7540 7541 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt, 7542 epilog_copies, reduc_fn, phis, 7543 double_reduc, slp_node, slp_node_instance, 7544 cond_reduc_val, cond_reduc_op_code, 7545 neutral_op); 7546 7547 return true; 7548 } 7549 7550 /* Function vect_min_worthwhile_factor. 7551 7552 For a loop where we could vectorize the operation indicated by CODE, 7553 return the minimum vectorization factor that makes it worthwhile 7554 to use generic vectors. */ 7555 static unsigned int 7556 vect_min_worthwhile_factor (enum tree_code code) 7557 { 7558 switch (code) 7559 { 7560 case PLUS_EXPR: 7561 case MINUS_EXPR: 7562 case NEGATE_EXPR: 7563 return 4; 7564 7565 case BIT_AND_EXPR: 7566 case BIT_IOR_EXPR: 7567 case BIT_XOR_EXPR: 7568 case BIT_NOT_EXPR: 7569 return 2; 7570 7571 default: 7572 return INT_MAX; 7573 } 7574 } 7575 7576 /* Return true if VINFO indicates we are doing loop vectorization and if 7577 it is worth decomposing CODE operations into scalar operations for 7578 that loop's vectorization factor. */ 7579 7580 bool 7581 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code) 7582 { 7583 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); 7584 unsigned HOST_WIDE_INT value; 7585 return (loop_vinfo 7586 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value) 7587 && value >= vect_min_worthwhile_factor (code)); 7588 } 7589 7590 /* Function vectorizable_induction 7591 7592 Check if PHI performs an induction computation that can be vectorized. 7593 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized 7594 phi to replace it, put it in VEC_STMT, and add it to the same basic block. 7595 Return FALSE if not a vectorizable STMT, TRUE otherwise. 
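   A typical candidate looks like (illustrative scalar GIMPLE):
     i_1 = PHI <0 (preheader), i_2 (latch)>
     ...
     i_2 = i_1 + 1;
   which, for a 4-lane vector and step 1, becomes a vector PHI whose initial
   value is [0, 1, 2, 3] and whose latch value adds [VF, VF, VF, VF] on each
   vectorized iteration; the transformation code below spells this out for
   the general case.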
*/ 7596 7597 bool 7598 vectorizable_induction (gimple *phi, 7599 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 7600 gimple **vec_stmt, slp_tree slp_node) 7601 { 7602 stmt_vec_info stmt_info = vinfo_for_stmt (phi); 7603 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 7604 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7605 unsigned ncopies; 7606 bool nested_in_vect_loop = false; 7607 struct loop *iv_loop; 7608 tree vec_def; 7609 edge pe = loop_preheader_edge (loop); 7610 basic_block new_bb; 7611 tree new_vec, vec_init, vec_step, t; 7612 tree new_name; 7613 gimple *new_stmt; 7614 gphi *induction_phi; 7615 tree induc_def, vec_dest; 7616 tree init_expr, step_expr; 7617 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 7618 unsigned i; 7619 tree expr; 7620 gimple_seq stmts; 7621 imm_use_iterator imm_iter; 7622 use_operand_p use_p; 7623 gimple *exit_phi; 7624 edge latch_e; 7625 tree loop_arg; 7626 gimple_stmt_iterator si; 7627 basic_block bb = gimple_bb (phi); 7628 7629 if (gimple_code (phi) != GIMPLE_PHI) 7630 return false; 7631 7632 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 7633 return false; 7634 7635 /* Make sure it was recognized as induction computation. */ 7636 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 7637 return false; 7638 7639 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7640 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 7641 7642 if (slp_node) 7643 ncopies = 1; 7644 else 7645 ncopies = vect_get_num_copies (loop_vinfo, vectype); 7646 gcc_assert (ncopies >= 1); 7647 7648 /* FORNOW. These restrictions should be relaxed. */ 7649 if (nested_in_vect_loop_p (loop, phi)) 7650 { 7651 imm_use_iterator imm_iter; 7652 use_operand_p use_p; 7653 gimple *exit_phi; 7654 edge latch_e; 7655 tree loop_arg; 7656 7657 if (ncopies > 1) 7658 { 7659 if (dump_enabled_p ()) 7660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7661 "multiple types in nested loop.\n"); 7662 return false; 7663 } 7664 7665 /* FORNOW: outer loop induction with SLP not supported. */ 7666 if (STMT_SLP_TYPE (stmt_info)) 7667 return false; 7668 7669 exit_phi = NULL; 7670 latch_e = loop_latch_edge (loop->inner); 7671 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7672 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 7673 { 7674 gimple *use_stmt = USE_STMT (use_p); 7675 if (is_gimple_debug (use_stmt)) 7676 continue; 7677 7678 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt))) 7679 { 7680 exit_phi = use_stmt; 7681 break; 7682 } 7683 } 7684 if (exit_phi) 7685 { 7686 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 7687 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 7688 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) 7689 { 7690 if (dump_enabled_p ()) 7691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7692 "inner-loop induction only used outside " 7693 "of the outer vectorized loop.\n"); 7694 return false; 7695 } 7696 } 7697 7698 nested_in_vect_loop = true; 7699 iv_loop = loop->inner; 7700 } 7701 else 7702 iv_loop = loop; 7703 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); 7704 7705 if (slp_node && !nunits.is_constant ()) 7706 { 7707 /* The current SLP code creates the initial value element-by-element. */ 7708 if (dump_enabled_p ()) 7709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7710 "SLP induction not supported for variable-length" 7711 " vectors.\n"); 7712 return false; 7713 } 7714 7715 if (!vec_stmt) /* transformation not required. 
*/ 7716 { 7717 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; 7718 if (dump_enabled_p ()) 7719 dump_printf_loc (MSG_NOTE, vect_location, 7720 "=== vectorizable_induction ===\n"); 7721 vect_model_induction_cost (stmt_info, ncopies); 7722 return true; 7723 } 7724 7725 /* Transform. */ 7726 7727 /* Compute a vector variable, initialized with the first VF values of 7728 the induction variable. E.g., for an iv with IV_PHI='X' and 7729 evolution S, for a vector of 4 units, we want to compute: 7730 [X, X + S, X + 2*S, X + 3*S]. */ 7731 7732 if (dump_enabled_p ()) 7733 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); 7734 7735 latch_e = loop_latch_edge (iv_loop); 7736 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7737 7738 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); 7739 gcc_assert (step_expr != NULL_TREE); 7740 7741 pe = loop_preheader_edge (iv_loop); 7742 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, 7743 loop_preheader_edge (iv_loop)); 7744 7745 stmts = NULL; 7746 if (!nested_in_vect_loop) 7747 { 7748 /* Convert the initial value to the desired type. */ 7749 tree new_type = TREE_TYPE (vectype); 7750 init_expr = gimple_convert (&stmts, new_type, init_expr); 7751 7752 /* If we are using the loop mask to "peel" for alignment then we need 7753 to adjust the start value here. */ 7754 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); 7755 if (skip_niters != NULL_TREE) 7756 { 7757 if (FLOAT_TYPE_P (vectype)) 7758 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, 7759 skip_niters); 7760 else 7761 skip_niters = gimple_convert (&stmts, new_type, skip_niters); 7762 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, 7763 skip_niters, step_expr); 7764 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, 7765 init_expr, skip_step); 7766 } 7767 } 7768 7769 /* Convert the step to the desired type. */ 7770 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr); 7771 7772 if (stmts) 7773 { 7774 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7775 gcc_assert (!new_bb); 7776 } 7777 7778 /* Find the first insertion point in the BB. */ 7779 si = gsi_after_labels (bb); 7780 7781 /* For SLP induction we have to generate several IVs as for example 7782 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S] 7783 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform 7784 [VF*S, VF*S, VF*S, VF*S] for all. */ 7785 if (slp_node) 7786 { 7787 /* Enforced above. */ 7788 unsigned int const_nunits = nunits.to_constant (); 7789 7790 /* Generate [VF*S, VF*S, ... ]. */ 7791 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7792 { 7793 expr = build_int_cst (integer_type_node, vf); 7794 expr = fold_convert (TREE_TYPE (step_expr), expr); 7795 } 7796 else 7797 expr = build_int_cst (TREE_TYPE (step_expr), vf); 7798 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7799 expr, step_expr); 7800 if (! CONSTANT_CLASS_P (new_name)) 7801 new_name = vect_init_vector (phi, new_name, 7802 TREE_TYPE (step_expr), NULL); 7803 new_vec = build_vector_from_val (vectype, new_name); 7804 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 7805 7806 /* Now generate the IVs. 
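	 To make the bookkeeping below concrete (numbers for illustration
	 only): with group_size = 3, const_nunits = 4 and nvects = 6 we need
	 elts = 24 IV elements overall; nivs = lcm (3, 4) / 4 = 3 vectors are
	 built element by element, and the remaining 3 vectors are derived
	 from them further down by adding a step of lcm (3, 4) / 3 = 4
	 iterations' worth of S.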
*/ 7807 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7808 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7809 unsigned elts = const_nunits * nvects; 7810 unsigned nivs = least_common_multiple (group_size, 7811 const_nunits) / const_nunits; 7812 gcc_assert (elts % group_size == 0); 7813 tree elt = init_expr; 7814 unsigned ivn; 7815 for (ivn = 0; ivn < nivs; ++ivn) 7816 { 7817 tree_vector_builder elts (vectype, const_nunits, 1); 7818 stmts = NULL; 7819 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) 7820 { 7821 if (ivn*const_nunits + eltn >= group_size 7822 && (ivn * const_nunits + eltn) % group_size == 0) 7823 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), 7824 elt, step_expr); 7825 elts.quick_push (elt); 7826 } 7827 vec_init = gimple_build_vector (&stmts, &elts); 7828 if (stmts) 7829 { 7830 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7831 gcc_assert (!new_bb); 7832 } 7833 7834 /* Create the induction-phi that defines the induction-operand. */ 7835 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 7836 induction_phi = create_phi_node (vec_dest, iv_loop->header); 7837 set_vinfo_for_stmt (induction_phi, 7838 new_stmt_vec_info (induction_phi, loop_vinfo)); 7839 induc_def = PHI_RESULT (induction_phi); 7840 7841 /* Create the iv update inside the loop */ 7842 vec_def = make_ssa_name (vec_dest); 7843 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 7844 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7845 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo)); 7846 7847 /* Set the arguments of the phi node: */ 7848 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7849 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7850 UNKNOWN_LOCATION); 7851 7852 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi); 7853 } 7854 7855 /* Re-use IVs when we can. */ 7856 if (ivn < nvects) 7857 { 7858 unsigned vfp 7859 = least_common_multiple (group_size, const_nunits) / group_size; 7860 /* Generate [VF'*S, VF'*S, ... ]. */ 7861 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7862 { 7863 expr = build_int_cst (integer_type_node, vfp); 7864 expr = fold_convert (TREE_TYPE (step_expr), expr); 7865 } 7866 else 7867 expr = build_int_cst (TREE_TYPE (step_expr), vfp); 7868 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7869 expr, step_expr); 7870 if (! CONSTANT_CLASS_P (new_name)) 7871 new_name = vect_init_vector (phi, new_name, 7872 TREE_TYPE (step_expr), NULL); 7873 new_vec = build_vector_from_val (vectype, new_name); 7874 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 7875 for (; ivn < nvects; ++ivn) 7876 { 7877 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]; 7878 tree def; 7879 if (gimple_code (iv) == GIMPLE_PHI) 7880 def = gimple_phi_result (iv); 7881 else 7882 def = gimple_assign_lhs (iv); 7883 new_stmt = gimple_build_assign (make_ssa_name (vectype), 7884 PLUS_EXPR, 7885 def, vec_step); 7886 if (gimple_code (iv) == GIMPLE_PHI) 7887 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7888 else 7889 { 7890 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); 7891 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING); 7892 } 7893 set_vinfo_for_stmt (new_stmt, 7894 new_stmt_vec_info (new_stmt, loop_vinfo)); 7895 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); 7896 } 7897 } 7898 7899 return true; 7900 } 7901 7902 /* Create the vector that holds the initial_value of the induction. 
*/ 7903 if (nested_in_vect_loop) 7904 { 7905 /* iv_loop is nested in the loop to be vectorized. init_expr had already 7906 been created during vectorization of previous stmts. We obtain it 7907 from the STMT_VINFO_VEC_STMT of the defining stmt. */ 7908 vec_init = vect_get_vec_def_for_operand (init_expr, phi); 7909 /* If the initial value is not of proper type, convert it. */ 7910 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) 7911 { 7912 new_stmt 7913 = gimple_build_assign (vect_get_new_ssa_name (vectype, 7914 vect_simple_var, 7915 "vec_iv_"), 7916 VIEW_CONVERT_EXPR, 7917 build1 (VIEW_CONVERT_EXPR, vectype, 7918 vec_init)); 7919 vec_init = gimple_assign_lhs (new_stmt); 7920 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), 7921 new_stmt); 7922 gcc_assert (!new_bb); 7923 set_vinfo_for_stmt (new_stmt, 7924 new_stmt_vec_info (new_stmt, loop_vinfo)); 7925 } 7926 } 7927 else 7928 { 7929 /* iv_loop is the loop to be vectorized. Create: 7930 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 7931 stmts = NULL; 7932 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); 7933 7934 unsigned HOST_WIDE_INT const_nunits; 7935 if (nunits.is_constant (&const_nunits)) 7936 { 7937 tree_vector_builder elts (vectype, const_nunits, 1); 7938 elts.quick_push (new_name); 7939 for (i = 1; i < const_nunits; i++) 7940 { 7941 /* Create: new_name_i = new_name + step_expr */ 7942 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), 7943 new_name, step_expr); 7944 elts.quick_push (new_name); 7945 } 7946 /* Create a vector from [new_name_0, new_name_1, ..., 7947 new_name_nunits-1] */ 7948 vec_init = gimple_build_vector (&stmts, &elts); 7949 } 7950 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) 7951 /* Build the initial value directly from a VEC_SERIES_EXPR. */ 7952 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype, 7953 new_name, step_expr); 7954 else 7955 { 7956 /* Build: 7957 [base, base, base, ...] 7958 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ 7959 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); 7960 gcc_assert (flag_associative_math); 7961 tree index = build_index_vector (vectype, 0, 1); 7962 tree base_vec = gimple_build_vector_from_val (&stmts, vectype, 7963 new_name); 7964 tree step_vec = gimple_build_vector_from_val (&stmts, vectype, 7965 step_expr); 7966 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index); 7967 vec_init = gimple_build (&stmts, MULT_EXPR, vectype, 7968 vec_init, step_vec); 7969 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype, 7970 vec_init, base_vec); 7971 } 7972 7973 if (stmts) 7974 { 7975 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7976 gcc_assert (!new_bb); 7977 } 7978 } 7979 7980 7981 /* Create the vector that holds the step of the induction. */ 7982 if (nested_in_vect_loop) 7983 /* iv_loop is nested in the loop to be vectorized. Generate: 7984 vec_step = [S, S, S, S] */ 7985 new_name = step_expr; 7986 else 7987 { 7988 /* iv_loop is the loop to be vectorized. 
Generate: 7989 vec_step = [VF*S, VF*S, VF*S, VF*S] */ 7990 gimple_seq seq = NULL; 7991 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7992 { 7993 expr = build_int_cst (integer_type_node, vf); 7994 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 7995 } 7996 else 7997 expr = build_int_cst (TREE_TYPE (step_expr), vf); 7998 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 7999 expr, step_expr); 8000 if (seq) 8001 { 8002 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 8003 gcc_assert (!new_bb); 8004 } 8005 } 8006 8007 t = unshare_expr (new_name); 8008 gcc_assert (CONSTANT_CLASS_P (new_name) 8009 || TREE_CODE (new_name) == SSA_NAME); 8010 new_vec = build_vector_from_val (vectype, t); 8011 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 8012 8013 8014 /* Create the following def-use cycle: 8015 loop prolog: 8016 vec_init = ... 8017 vec_step = ... 8018 loop: 8019 vec_iv = PHI <vec_init, vec_loop> 8020 ... 8021 STMT 8022 ... 8023 vec_loop = vec_iv + vec_step; */ 8024 8025 /* Create the induction-phi that defines the induction-operand. */ 8026 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 8027 induction_phi = create_phi_node (vec_dest, iv_loop->header); 8028 set_vinfo_for_stmt (induction_phi, 8029 new_stmt_vec_info (induction_phi, loop_vinfo)); 8030 induc_def = PHI_RESULT (induction_phi); 8031 8032 /* Create the iv update inside the loop */ 8033 vec_def = make_ssa_name (vec_dest); 8034 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 8035 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 8036 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo)); 8037 8038 /* Set the arguments of the phi node: */ 8039 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 8040 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 8041 UNKNOWN_LOCATION); 8042 8043 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi; 8044 8045 /* In case that vectorization factor (VF) is bigger than the number 8046 of elements that we can fit in a vectype (nunits), we have to generate 8047 more than one vector stmt - i.e - we need to "unroll" the 8048 vector stmt by a factor VF/nunits. For more details see documentation 8049 in vectorizable_operation. */ 8050 8051 if (ncopies > 1) 8052 { 8053 gimple_seq seq = NULL; 8054 stmt_vec_info prev_stmt_vinfo; 8055 /* FORNOW. This restriction should be relaxed. */ 8056 gcc_assert (!nested_in_vect_loop); 8057 8058 /* Create the vector that holds the step of the induction. 
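	 Each additional copy advances by one vector's worth of iterations
	 rather than by a whole VF.  E.g. (illustrative, nunits = 4, scalar
	 step S): copy 0 covers [X, X+S, X+2*S, X+3*S] and copy 1 is produced
	 below by adding [4*S, 4*S, 4*S, 4*S] to it.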
*/ 8059 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 8060 { 8061 expr = build_int_cst (integer_type_node, nunits); 8062 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 8063 } 8064 else 8065 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 8066 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 8067 expr, step_expr); 8068 if (seq) 8069 { 8070 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 8071 gcc_assert (!new_bb); 8072 } 8073 8074 t = unshare_expr (new_name); 8075 gcc_assert (CONSTANT_CLASS_P (new_name) 8076 || TREE_CODE (new_name) == SSA_NAME); 8077 new_vec = build_vector_from_val (vectype, t); 8078 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 8079 8080 vec_def = induc_def; 8081 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); 8082 for (i = 1; i < ncopies; i++) 8083 { 8084 /* vec_i = vec_prev + vec_step */ 8085 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, 8086 vec_def, vec_step); 8087 vec_def = make_ssa_name (vec_dest, new_stmt); 8088 gimple_assign_set_lhs (new_stmt, vec_def); 8089 8090 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 8091 set_vinfo_for_stmt (new_stmt, 8092 new_stmt_vec_info (new_stmt, loop_vinfo)); 8093 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; 8094 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); 8095 } 8096 } 8097 8098 if (nested_in_vect_loop) 8099 { 8100 /* Find the loop-closed exit-phi of the induction, and record 8101 the final vector of induction results: */ 8102 exit_phi = NULL; 8103 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 8104 { 8105 gimple *use_stmt = USE_STMT (use_p); 8106 if (is_gimple_debug (use_stmt)) 8107 continue; 8108 8109 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))) 8110 { 8111 exit_phi = use_stmt; 8112 break; 8113 } 8114 } 8115 if (exit_phi) 8116 { 8117 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); 8118 /* FORNOW. Currently not supporting the case that an inner-loop induction 8119 is not used in the outer-loop (i.e. only outside the outer-loop). */ 8120 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 8121 && !STMT_VINFO_LIVE_P (stmt_vinfo)); 8122 8123 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; 8124 if (dump_enabled_p ()) 8125 { 8126 dump_printf_loc (MSG_NOTE, vect_location, 8127 "vector of inductions after inner-loop:"); 8128 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0); 8129 } 8130 } 8131 } 8132 8133 8134 if (dump_enabled_p ()) 8135 { 8136 dump_printf_loc (MSG_NOTE, vect_location, 8137 "transform induction: created def-use cycle: "); 8138 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0); 8139 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 8140 SSA_NAME_DEF_STMT (vec_def), 0); 8141 } 8142 8143 return true; 8144 } 8145 8146 /* Function vectorizable_live_operation. 8147 8148 STMT computes a value that is used outside the loop. Check if 8149 it can be supported. 
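   A minimal example (illustrative) is a scalar that survives the loop:
     for (i = 0; i < n; i++)
       last = a[i];
     use (last);
   after vectorization the value of 'last' must be recovered from the final
   lane of the last vector of loaded values (or, for fully-masked loops, via
   an EXTRACT_LAST of the final iteration's active lanes), which is what the
   code below arranges.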
*/ 8150 8151 bool 8152 vectorizable_live_operation (gimple *stmt, 8153 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 8154 slp_tree slp_node, int slp_index, 8155 gimple **vec_stmt) 8156 { 8157 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 8158 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 8159 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8160 imm_use_iterator imm_iter; 8161 tree lhs, lhs_type, bitsize, vec_bitsize; 8162 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 8163 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 8164 int ncopies; 8165 gimple *use_stmt; 8166 auto_vec<tree> vec_oprnds; 8167 int vec_entry = 0; 8168 poly_uint64 vec_index = 0; 8169 8170 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); 8171 8172 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) 8173 return false; 8174 8175 /* FORNOW. CHECKME. */ 8176 if (nested_in_vect_loop_p (loop, stmt)) 8177 return false; 8178 8179 /* If STMT is not relevant and it is a simple assignment and its inputs are 8180 invariant then it can remain in place, unvectorized. The original last 8181 scalar value that it computes will be used. */ 8182 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 8183 { 8184 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo)); 8185 if (dump_enabled_p ()) 8186 dump_printf_loc (MSG_NOTE, vect_location, 8187 "statement is simple and uses invariant. Leaving in " 8188 "place.\n"); 8189 return true; 8190 } 8191 8192 if (slp_node) 8193 ncopies = 1; 8194 else 8195 ncopies = vect_get_num_copies (loop_vinfo, vectype); 8196 8197 if (slp_node) 8198 { 8199 gcc_assert (slp_index >= 0); 8200 8201 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length (); 8202 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 8203 8204 /* Get the last occurrence of the scalar index from the concatenation of 8205 all the slp vectors. Calculate which slp vector it is and the index 8206 within. */ 8207 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; 8208 8209 /* Calculate which vector contains the result, and which lane of 8210 that vector we need. */ 8211 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index)) 8212 { 8213 if (dump_enabled_p ()) 8214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8215 "Cannot determine which vector holds the" 8216 " final result.\n"); 8217 return false; 8218 } 8219 } 8220 8221 if (!vec_stmt) 8222 { 8223 /* No transformation required. 
*/ 8224 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 8225 { 8226 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, 8227 OPTIMIZE_FOR_SPEED)) 8228 { 8229 if (dump_enabled_p ()) 8230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8231 "can't use a fully-masked loop because " 8232 "the target doesn't support extract last " 8233 "reduction.\n"); 8234 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8235 } 8236 else if (slp_node) 8237 { 8238 if (dump_enabled_p ()) 8239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8240 "can't use a fully-masked loop because an " 8241 "SLP statement is live after the loop.\n"); 8242 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8243 } 8244 else if (ncopies > 1) 8245 { 8246 if (dump_enabled_p ()) 8247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8248 "can't use a fully-masked loop because" 8249 " ncopies is greater than 1.\n"); 8250 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8251 } 8252 else 8253 { 8254 gcc_assert (ncopies == 1 && !slp_node); 8255 vect_record_loop_mask (loop_vinfo, 8256 &LOOP_VINFO_MASKS (loop_vinfo), 8257 1, vectype); 8258 } 8259 } 8260 return true; 8261 } 8262 8263 /* If stmt has a related stmt, then use that for getting the lhs. */ 8264 if (is_pattern_stmt_p (stmt_info)) 8265 stmt = STMT_VINFO_RELATED_STMT (stmt_info); 8266 8267 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt) 8268 : gimple_get_lhs (stmt); 8269 lhs_type = TREE_TYPE (lhs); 8270 8271 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype) 8272 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype))) 8273 : TYPE_SIZE (TREE_TYPE (vectype))); 8274 vec_bitsize = TYPE_SIZE (vectype); 8275 8276 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */ 8277 tree vec_lhs, bitstart; 8278 if (slp_node) 8279 { 8280 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 8281 8282 /* Get the correct slp vectorized stmt. */ 8283 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]; 8284 if (gphi *phi = dyn_cast <gphi *> (vec_stmt)) 8285 vec_lhs = gimple_phi_result (phi); 8286 else 8287 vec_lhs = gimple_get_lhs (vec_stmt); 8288 8289 /* Get entry to use. */ 8290 bitstart = bitsize_int (vec_index); 8291 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart); 8292 } 8293 else 8294 { 8295 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info); 8296 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt); 8297 gcc_checking_assert (ncopies == 1 8298 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 8299 8300 /* For multiple copies, get the last copy. */ 8301 for (int i = 1; i < ncopies; ++i) 8302 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, 8303 vec_lhs); 8304 8305 /* Get the last lane in the vector. */ 8306 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize); 8307 } 8308 8309 gimple_seq stmts = NULL; 8310 tree new_tree; 8311 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 8312 { 8313 /* Emit: 8314 8315 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> 8316 8317 where VEC_LHS is the vectorized live-out result and MASK is 8318 the loop mask for the final iteration. 
*/ 8319 gcc_assert (ncopies == 1 && !slp_node); 8320 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); 8321 tree scalar_res = make_ssa_name (scalar_type); 8322 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 8323 1, vectype, 0); 8324 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST, 8325 2, mask, vec_lhs); 8326 gimple_call_set_lhs (new_stmt, scalar_res); 8327 gimple_seq_add_stmt (&stmts, new_stmt); 8328 8329 /* Convert the extracted vector element to the required scalar type. */ 8330 new_tree = gimple_convert (&stmts, lhs_type, scalar_res); 8331 } 8332 else 8333 { 8334 tree bftype = TREE_TYPE (vectype); 8335 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 8336 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); 8337 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart); 8338 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), 8339 &stmts, true, NULL_TREE); 8340 } 8341 8342 if (stmts) 8343 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts); 8344 8345 /* Replace use of lhs with newly computed result. If the use stmt is a 8346 single arg PHI, just replace all uses of PHI result. It's necessary 8347 because lcssa PHI defining lhs may be before newly inserted stmt. */ 8348 use_operand_p use_p; 8349 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) 8350 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 8351 && !is_gimple_debug (use_stmt)) 8352 { 8353 if (gimple_code (use_stmt) == GIMPLE_PHI 8354 && gimple_phi_num_args (use_stmt) == 1) 8355 { 8356 replace_uses_by (gimple_phi_result (use_stmt), new_tree); 8357 } 8358 else 8359 { 8360 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 8361 SET_USE (use_p, new_tree); 8362 } 8363 update_stmt (use_stmt); 8364 } 8365 8366 return true; 8367 } 8368 8369 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */ 8370 8371 static void 8372 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt) 8373 { 8374 ssa_op_iter op_iter; 8375 imm_use_iterator imm_iter; 8376 def_operand_p def_p; 8377 gimple *ustmt; 8378 8379 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF) 8380 { 8381 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) 8382 { 8383 basic_block bb; 8384 8385 if (!is_gimple_debug (ustmt)) 8386 continue; 8387 8388 bb = gimple_bb (ustmt); 8389 8390 if (!flow_bb_inside_loop_p (loop, bb)) 8391 { 8392 if (gimple_debug_bind_p (ustmt)) 8393 { 8394 if (dump_enabled_p ()) 8395 dump_printf_loc (MSG_NOTE, vect_location, 8396 "killing debug use\n"); 8397 8398 gimple_debug_bind_reset_value (ustmt); 8399 update_stmt (ustmt); 8400 } 8401 else 8402 gcc_unreachable (); 8403 } 8404 } 8405 } 8406 } 8407 8408 /* Given loop represented by LOOP_VINFO, return true if computation of 8409 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false 8410 otherwise. */ 8411 8412 static bool 8413 loop_niters_no_overflow (loop_vec_info loop_vinfo) 8414 { 8415 /* Constant case. */ 8416 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 8417 { 8418 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo); 8419 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); 8420 8421 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST); 8422 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST); 8423 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) 8424 return true; 8425 } 8426 8427 widest_int max; 8428 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8429 /* Check the upper bound of loop niters. 
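     E.g. (illustrative) for a 32-bit unsigned niters type: an upper bound
     of at most 0xfffffffe latch executions guarantees that NITERSM1 + 1
     still fits in the type, whereas a bound equal to the type's maximum
     makes us conservatively return false.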
*/ 8430 if (get_max_loop_iterations (loop, &max)) 8431 { 8432 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 8433 signop sgn = TYPE_SIGN (type); 8434 widest_int type_max = widest_int::from (wi::max_value (type), sgn); 8435 if (max < type_max) 8436 return true; 8437 } 8438 return false; 8439 } 8440 8441 /* Return a mask type with half the number of elements as TYPE. */ 8442 8443 tree 8444 vect_halve_mask_nunits (tree type) 8445 { 8446 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); 8447 return build_truth_vector_type (nunits, current_vector_size); 8448 } 8449 8450 /* Return a mask type with twice as many elements as TYPE. */ 8451 8452 tree 8453 vect_double_mask_nunits (tree type) 8454 { 8455 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; 8456 return build_truth_vector_type (nunits, current_vector_size); 8457 } 8458 8459 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to 8460 contain a sequence of NVECTORS masks that each control a vector of type 8461 VECTYPE. */ 8462 8463 void 8464 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, 8465 unsigned int nvectors, tree vectype) 8466 { 8467 gcc_assert (nvectors != 0); 8468 if (masks->length () < nvectors) 8469 masks->safe_grow_cleared (nvectors); 8470 rgroup_masks *rgm = &(*masks)[nvectors - 1]; 8471 /* The number of scalars per iteration and the number of vectors are 8472 both compile-time constants. */ 8473 unsigned int nscalars_per_iter 8474 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), 8475 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); 8476 if (rgm->max_nscalars_per_iter < nscalars_per_iter) 8477 { 8478 rgm->max_nscalars_per_iter = nscalars_per_iter; 8479 rgm->mask_type = build_same_sized_truth_vector_type (vectype); 8480 } 8481 } 8482 8483 /* Given a complete set of masks MASKS, extract mask number INDEX 8484 for an rgroup that operates on NVECTORS vectors of type VECTYPE, 8485 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI. 8486 8487 See the comment above vec_loop_masks for more details about the mask 8488 arrangement. */ 8489 8490 tree 8491 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, 8492 unsigned int nvectors, tree vectype, unsigned int index) 8493 { 8494 rgroup_masks *rgm = &(*masks)[nvectors - 1]; 8495 tree mask_type = rgm->mask_type; 8496 8497 /* Populate the rgroup's mask array, if this is the first time we've 8498 used it. */ 8499 if (rgm->masks.is_empty ()) 8500 { 8501 rgm->masks.safe_grow_cleared (nvectors); 8502 for (unsigned int i = 0; i < nvectors; ++i) 8503 { 8504 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); 8505 /* Provide a dummy definition until the real one is available. */ 8506 SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); 8507 rgm->masks[i] = mask; 8508 } 8509 } 8510 8511 tree mask = rgm->masks[index]; 8512 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), 8513 TYPE_VECTOR_SUBPARTS (vectype))) 8514 { 8515 /* A loop mask for data type X can be reused for data type Y 8516 if X has N times more elements than Y and if Y's elements 8517 are N times bigger than X's. In this case each sequence 8518 of N elements in the loop mask will be all-zero or all-one. 8519 We can then view-convert the mask so that each sequence of 8520 N elements is replaced by a single element. 
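	 Concretely (element counts purely for illustration): a mask recorded
	 for 8 short elements can serve a statement operating on 4 int
	 elements; since the ints are twice the size of the shorts, each
	 adjacent pair of mask elements is known to be identical, and the
	 VIEW_CONVERT_EXPR built below just reinterprets the 8-element
	 boolean vector as the 4-element boolean vector that statement
	 expects.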
*/ 8521 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), 8522 TYPE_VECTOR_SUBPARTS (vectype))); 8523 gimple_seq seq = NULL; 8524 mask_type = build_same_sized_truth_vector_type (vectype); 8525 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); 8526 if (seq) 8527 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); 8528 } 8529 return mask; 8530 } 8531 8532 /* Scale profiling counters by estimation for LOOP which is vectorized 8533 by factor VF. */ 8534 8535 static void 8536 scale_profile_for_vect_loop (struct loop *loop, unsigned vf) 8537 { 8538 edge preheader = loop_preheader_edge (loop); 8539 /* Reduce loop iterations by the vectorization factor. */ 8540 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); 8541 profile_count freq_h = loop->header->count, freq_e = preheader->count (); 8542 8543 if (freq_h.nonzero_p ()) 8544 { 8545 profile_probability p; 8546 8547 /* Avoid dropping loop body profile counter to 0 because of zero count 8548 in loop's preheader. */ 8549 if (!(freq_e == profile_count::zero ())) 8550 freq_e = freq_e.force_nonzero (); 8551 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h); 8552 scale_loop_frequencies (loop, p); 8553 } 8554 8555 edge exit_e = single_exit (loop); 8556 exit_e->probability = profile_probability::always () 8557 .apply_scale (1, new_est_niter + 1); 8558 8559 edge exit_l = single_pred_edge (loop->latch); 8560 profile_probability prob = exit_l->probability; 8561 exit_l->probability = exit_e->probability.invert (); 8562 if (prob.initialized_p () && exit_l->probability.initialized_p ()) 8563 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob); 8564 } 8565 8566 /* Function vect_transform_loop. 8567 8568 The analysis phase has determined that the loop is vectorizable. 8569 Vectorize the loop - created vectorized stmts to replace the scalar 8570 stmts in the loop, and update the loop exit condition. 8571 Returns scalar epilogue loop if any. */ 8572 8573 struct loop * 8574 vect_transform_loop (loop_vec_info loop_vinfo) 8575 { 8576 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8577 struct loop *epilogue = NULL; 8578 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 8579 int nbbs = loop->num_nodes; 8580 int i; 8581 tree niters_vector = NULL_TREE; 8582 tree step_vector = NULL_TREE; 8583 tree niters_vector_mult_vf = NULL_TREE; 8584 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8585 unsigned int lowest_vf = constant_lower_bound (vf); 8586 bool grouped_store; 8587 bool slp_scheduled = false; 8588 gimple *stmt, *pattern_stmt; 8589 gimple_seq pattern_def_seq = NULL; 8590 gimple_stmt_iterator pattern_def_si = gsi_none (); 8591 bool transform_pattern_stmt = false; 8592 bool check_profitability = false; 8593 unsigned int th; 8594 8595 if (dump_enabled_p ()) 8596 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n"); 8597 8598 /* Use the more conservative vectorization threshold. If the number 8599 of iterations is constant assume the cost check has been performed 8600 by our caller. If the threshold makes all loops profitable that 8601 run at least the (estimated) vectorization factor number of times 8602 checking is pointless, too. 
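     As an illustrative numeric case: if the cost model threshold is 12
     iterations, the (estimated) vectorization factor is 4 and the iteration
     count is not a compile-time constant, check_profitability is set below
     so that a runtime guard is emitted; with a threshold of 3 and the same
     factor, any loop that runs at least VF times is profitable anyway and
     the guard is skipped.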
*/ 8603 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 8604 if (th >= vect_vf_for_cost (loop_vinfo) 8605 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 8606 { 8607 if (dump_enabled_p ()) 8608 dump_printf_loc (MSG_NOTE, vect_location, 8609 "Profitability threshold is %d loop iterations.\n", 8610 th); 8611 check_profitability = true; 8612 } 8613 8614 /* Make sure there exists a single-predecessor exit bb. Do this before 8615 versioning. */ 8616 edge e = single_exit (loop); 8617 if (! single_pred_p (e->dest)) 8618 { 8619 split_loop_exit_edge (e); 8620 if (dump_enabled_p ()) 8621 dump_printf (MSG_NOTE, "split exit edge\n"); 8622 } 8623 8624 /* Version the loop first, if required, so the profitability check 8625 comes first. */ 8626 8627 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 8628 { 8629 poly_uint64 versioning_threshold 8630 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); 8631 if (check_profitability 8632 && ordered_p (poly_uint64 (th), versioning_threshold)) 8633 { 8634 versioning_threshold = ordered_max (poly_uint64 (th), 8635 versioning_threshold); 8636 check_profitability = false; 8637 } 8638 vect_loop_versioning (loop_vinfo, th, check_profitability, 8639 versioning_threshold); 8640 check_profitability = false; 8641 } 8642 8643 /* Make sure there exists a single-predecessor exit bb also on the 8644 scalar loop copy. Do this after versioning but before peeling 8645 so CFG structure is fine for both scalar and if-converted loop 8646 to make slpeel_duplicate_current_defs_from_edges face matched 8647 loop closed PHI nodes on the exit. */ 8648 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 8649 { 8650 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); 8651 if (! single_pred_p (e->dest)) 8652 { 8653 split_loop_exit_edge (e); 8654 if (dump_enabled_p ()) 8655 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); 8656 } 8657 } 8658 8659 tree niters = vect_build_loop_niters (loop_vinfo); 8660 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; 8661 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); 8662 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); 8663 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, 8664 &step_vector, &niters_vector_mult_vf, th, 8665 check_profitability, niters_no_overflow); 8666 8667 if (niters_vector == NULL_TREE) 8668 { 8669 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 8670 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 8671 && known_eq (lowest_vf, vf)) 8672 { 8673 niters_vector 8674 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), 8675 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf); 8676 step_vector = build_one_cst (TREE_TYPE (niters)); 8677 } 8678 else 8679 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, 8680 &step_vector, niters_no_overflow); 8681 } 8682 8683 /* 1) Make sure the loop header has exactly two entries 8684 2) Make sure we have a preheader basic block. */ 8685 8686 gcc_assert (EDGE_COUNT (loop->header->preds) == 2); 8687 8688 split_edge (loop_preheader_edge (loop)); 8689 8690 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 8691 && vect_use_loop_mask_for_alignment_p (loop_vinfo)) 8692 /* This will deal with any possible peeling. */ 8693 vect_prepare_for_masked_peels (loop_vinfo); 8694 8695 /* FORNOW: the vectorizer supports only loops which body consist 8696 of one basic block (header + empty latch). When the vectorizer will 8697 support more involved loop forms, the order by which the BBs are 8698 traversed need to be reconsidered. 
*/ 8699 8700 for (i = 0; i < nbbs; i++) 8701 { 8702 basic_block bb = bbs[i]; 8703 stmt_vec_info stmt_info; 8704 8705 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 8706 gsi_next (&si)) 8707 { 8708 gphi *phi = si.phi (); 8709 if (dump_enabled_p ()) 8710 { 8711 dump_printf_loc (MSG_NOTE, vect_location, 8712 "------>vectorizing phi: "); 8713 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 8714 } 8715 stmt_info = vinfo_for_stmt (phi); 8716 if (!stmt_info) 8717 continue; 8718 8719 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 8720 vect_loop_kill_debug_uses (loop, phi); 8721 8722 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8723 && !STMT_VINFO_LIVE_P (stmt_info)) 8724 continue; 8725 8726 if (STMT_VINFO_VECTYPE (stmt_info) 8727 && (maybe_ne 8728 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf)) 8729 && dump_enabled_p ()) 8730 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8731 8732 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 8733 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 8734 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 8735 && ! PURE_SLP_STMT (stmt_info)) 8736 { 8737 if (dump_enabled_p ()) 8738 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n"); 8739 vect_transform_stmt (phi, NULL, NULL, NULL, NULL); 8740 } 8741 } 8742 8743 pattern_stmt = NULL; 8744 for (gimple_stmt_iterator si = gsi_start_bb (bb); 8745 !gsi_end_p (si) || transform_pattern_stmt;) 8746 { 8747 bool is_store; 8748 8749 if (transform_pattern_stmt) 8750 stmt = pattern_stmt; 8751 else 8752 { 8753 stmt = gsi_stmt (si); 8754 /* During vectorization remove existing clobber stmts. */ 8755 if (gimple_clobber_p (stmt)) 8756 { 8757 unlink_stmt_vdef (stmt); 8758 gsi_remove (&si, true); 8759 release_defs (stmt); 8760 continue; 8761 } 8762 } 8763 8764 if (dump_enabled_p ()) 8765 { 8766 dump_printf_loc (MSG_NOTE, vect_location, 8767 "------>vectorizing statement: "); 8768 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 8769 } 8770 8771 stmt_info = vinfo_for_stmt (stmt); 8772 8773 /* vector stmts created in the outer-loop during vectorization of 8774 stmts in an inner-loop may not have a stmt_info, and do not 8775 need to be vectorized. */ 8776 if (!stmt_info) 8777 { 8778 gsi_next (&si); 8779 continue; 8780 } 8781 8782 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 8783 vect_loop_kill_debug_uses (loop, stmt); 8784 8785 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8786 && !STMT_VINFO_LIVE_P (stmt_info)) 8787 { 8788 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 8789 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 8790 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 8791 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 8792 { 8793 stmt = pattern_stmt; 8794 stmt_info = vinfo_for_stmt (stmt); 8795 } 8796 else 8797 { 8798 gsi_next (&si); 8799 continue; 8800 } 8801 } 8802 else if (STMT_VINFO_IN_PATTERN_P (stmt_info) 8803 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 8804 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 8805 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 8806 transform_pattern_stmt = true; 8807 8808 /* If pattern statement has def stmts, vectorize them too. 
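   That is, the statements recorded in STMT_VINFO_PATTERN_DEF_SEQ
   (stmt_info) - auxiliary statements created by the pattern recognizer to
   define operands of the main pattern statement - are walked below in
   sequence order, and each relevant or live one is vectorized before the
   pattern statement itself; the others are skipped. For instance (purely
   illustrative), a recognizer may queue an intermediate-type conversion of
   one operand as such a def statement.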
*/ 8809 if (is_pattern_stmt_p (stmt_info)) 8810 { 8811 if (pattern_def_seq == NULL) 8812 { 8813 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 8814 pattern_def_si = gsi_start (pattern_def_seq); 8815 } 8816 else if (!gsi_end_p (pattern_def_si)) 8817 gsi_next (&pattern_def_si); 8818 if (pattern_def_seq != NULL) 8819 { 8820 gimple *pattern_def_stmt = NULL; 8821 stmt_vec_info pattern_def_stmt_info = NULL; 8822 8823 while (!gsi_end_p (pattern_def_si)) 8824 { 8825 pattern_def_stmt = gsi_stmt (pattern_def_si); 8826 pattern_def_stmt_info 8827 = vinfo_for_stmt (pattern_def_stmt); 8828 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) 8829 || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) 8830 break; 8831 gsi_next (&pattern_def_si); 8832 } 8833 8834 if (!gsi_end_p (pattern_def_si)) 8835 { 8836 if (dump_enabled_p ()) 8837 { 8838 dump_printf_loc (MSG_NOTE, vect_location, 8839 "==> vectorizing pattern def " 8840 "stmt: "); 8841 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 8842 pattern_def_stmt, 0); 8843 } 8844 8845 stmt = pattern_def_stmt; 8846 stmt_info = pattern_def_stmt_info; 8847 } 8848 else 8849 { 8850 pattern_def_si = gsi_none (); 8851 transform_pattern_stmt = false; 8852 } 8853 } 8854 else 8855 transform_pattern_stmt = false; 8856 } 8857 8858 if (STMT_VINFO_VECTYPE (stmt_info)) 8859 { 8860 poly_uint64 nunits 8861 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 8862 if (!STMT_SLP_TYPE (stmt_info) 8863 && maybe_ne (nunits, vf) 8864 && dump_enabled_p ()) 8865 /* For SLP VF is set according to unrolling factor, and not 8866 to vector size, hence for SLP this print is not valid. */ 8867 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8868 } 8869 8870 /* SLP. Schedule all the SLP instances when the first SLP stmt is 8871 reached. */ 8872 if (STMT_SLP_TYPE (stmt_info)) 8873 { 8874 if (!slp_scheduled) 8875 { 8876 slp_scheduled = true; 8877 8878 if (dump_enabled_p ()) 8879 dump_printf_loc (MSG_NOTE, vect_location, 8880 "=== scheduling SLP instances ===\n"); 8881 8882 vect_schedule_slp (loop_vinfo); 8883 } 8884 8885 /* Hybrid SLP stmts must be vectorized in addition to SLP. */ 8886 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info)) 8887 { 8888 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si)) 8889 { 8890 pattern_def_seq = NULL; 8891 gsi_next (&si); 8892 } 8893 continue; 8894 } 8895 } 8896 8897 /* -------- vectorize statement ------------ */ 8898 if (dump_enabled_p ()) 8899 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n"); 8900 8901 grouped_store = false; 8902 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL); 8903 if (is_store) 8904 { 8905 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 8906 { 8907 /* Interleaving. If IS_STORE is TRUE, the vectorization of the 8908 interleaving chain was completed - free all the stores in 8909 the chain. */ 8910 gsi_next (&si); 8911 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info)); 8912 } 8913 else 8914 { 8915 /* Free the attached stmt_vec_info and remove the stmt. */ 8916 gimple *store = gsi_stmt (si); 8917 free_stmt_vec_info (store); 8918 unlink_stmt_vdef (store); 8919 gsi_remove (&si, true); 8920 release_defs (store); 8921 } 8922 8923 /* Stores can only appear at the end of pattern statements. 
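   (A store has no SSA lhs that a later statement of the same pattern
   could use, so it should always be the last statement of its pattern;
   hence no pattern statement can still be pending here.)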
*/ 8924 gcc_assert (!transform_pattern_stmt); 8925 pattern_def_seq = NULL; 8926 } 8927 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si)) 8928 { 8929 pattern_def_seq = NULL; 8930 gsi_next (&si); 8931 } 8932 } /* stmts in BB */ 8933 8934 /* Stub out scalar statements that must not survive vectorization. 8935 Doing this here helps with grouped statements, or statements that 8936 are involved in patterns. */ 8937 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); 8938 !gsi_end_p (gsi); gsi_next (&gsi)) 8939 { 8940 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi)); 8941 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD)) 8942 { 8943 tree lhs = gimple_get_lhs (call); 8944 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 8945 { 8946 tree zero = build_zero_cst (TREE_TYPE (lhs)); 8947 gimple *new_stmt = gimple_build_assign (lhs, zero); 8948 gsi_replace (&gsi, new_stmt, true); 8949 } 8950 } 8951 } 8952 } /* BBs in loop */ 8953 8954 /* The vectorization factor is always > 1, so if we use an IV increment of 1. 8955 a zero NITERS becomes a nonzero NITERS_VECTOR. */ 8956 if (integer_onep (step_vector)) 8957 niters_no_overflow = true; 8958 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector, 8959 niters_vector_mult_vf, !niters_no_overflow); 8960 8961 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 8962 scale_profile_for_vect_loop (loop, assumed_vf); 8963 8964 /* True if the final iteration might not handle a full vector's 8965 worth of scalar iterations. */ 8966 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 8967 /* The minimum number of iterations performed by the epilogue. This 8968 is 1 when peeling for gaps because we always need a final scalar 8969 iteration. */ 8970 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; 8971 /* +1 to convert latch counts to loop iteration counts, 8972 -min_epilogue_iters to remove iterations that cannot be performed 8973 by the vector code. */ 8974 int bias_for_lowest = 1 - min_epilogue_iters; 8975 int bias_for_assumed = bias_for_lowest; 8976 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 8977 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 8978 { 8979 /* When the amount of peeling is known at compile time, the first 8980 iteration will have exactly alignment_npeels active elements. 8981 In the worst case it will have at least one. */ 8982 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1); 8983 bias_for_lowest += lowest_vf - min_first_active; 8984 bias_for_assumed += assumed_vf - min_first_active; 8985 } 8986 /* In these calculations the "- 1" converts loop iteration counts 8987 back to latch counts. */ 8988 if (loop->any_upper_bound) 8989 loop->nb_iterations_upper_bound 8990 = (final_iter_may_be_partial 8991 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest, 8992 lowest_vf) - 1 8993 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest, 8994 lowest_vf) - 1); 8995 if (loop->any_likely_upper_bound) 8996 loop->nb_iterations_likely_upper_bound 8997 = (final_iter_may_be_partial 8998 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound 8999 + bias_for_lowest, lowest_vf) - 1 9000 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound 9001 + bias_for_lowest, lowest_vf) - 1); 9002 if (loop->any_estimate) 9003 loop->nb_iterations_estimate 9004 = (final_iter_may_be_partial 9005 ? 
wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed, 9006 assumed_vf) - 1 9007 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed, 9008 assumed_vf) - 1); 9009 9010 if (dump_enabled_p ()) 9011 { 9012 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 9013 { 9014 dump_printf_loc (MSG_NOTE, vect_location, 9015 "LOOP VECTORIZED\n"); 9016 if (loop->inner) 9017 dump_printf_loc (MSG_NOTE, vect_location, 9018 "OUTER LOOP VECTORIZED\n"); 9019 dump_printf (MSG_NOTE, "\n"); 9020 } 9021 else 9022 { 9023 dump_printf_loc (MSG_NOTE, vect_location, 9024 "LOOP EPILOGUE VECTORIZED (VS="); 9025 dump_dec (MSG_NOTE, current_vector_size); 9026 dump_printf (MSG_NOTE, ")\n"); 9027 } 9028 } 9029 9030 /* Free SLP instances here because otherwise stmt reference counting 9031 won't work. */ 9032 slp_instance instance; 9033 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 9034 vect_free_slp_instance (instance); 9035 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 9036 /* Clear-up safelen field since its value is invalid after vectorization 9037 since vectorized loop can have loop-carried dependencies. */ 9038 loop->safelen = 0; 9039 9040 /* Don't vectorize epilogue for epilogue. */ 9041 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 9042 epilogue = NULL; 9043 9044 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)) 9045 epilogue = NULL; 9046 9047 if (epilogue) 9048 { 9049 auto_vector_sizes vector_sizes; 9050 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 9051 unsigned int next_size = 0; 9052 9053 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 9054 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0 9055 && known_eq (vf, lowest_vf)) 9056 { 9057 unsigned int eiters 9058 = (LOOP_VINFO_INT_NITERS (loop_vinfo) 9059 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); 9060 eiters = eiters % lowest_vf; 9061 epilogue->nb_iterations_upper_bound = eiters - 1; 9062 9063 unsigned int ratio; 9064 while (next_size < vector_sizes.length () 9065 && !(constant_multiple_p (current_vector_size, 9066 vector_sizes[next_size], &ratio) 9067 && eiters >= lowest_vf / ratio)) 9068 next_size += 1; 9069 } 9070 else 9071 while (next_size < vector_sizes.length () 9072 && maybe_lt (current_vector_size, vector_sizes[next_size])) 9073 next_size += 1; 9074 9075 if (next_size == vector_sizes.length ()) 9076 epilogue = NULL; 9077 } 9078 9079 if (epilogue) 9080 { 9081 epilogue->force_vectorize = loop->force_vectorize; 9082 epilogue->safelen = loop->safelen; 9083 epilogue->dont_vectorize = false; 9084 9085 /* We may need to if-convert epilogue to vectorize it. */ 9086 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 9087 tree_if_conversion (epilogue); 9088 } 9089 9090 return epilogue; 9091 } 9092 9093 /* The code below is trying to perform simple optimization - revert 9094 if-conversion for masked stores, i.e. if the mask of a store is zero 9095 do not perform it and all stored value producers also if possible. 
9096 For example, 9097 for (i=0; i<n; i++) 9098 if (c[i]) 9099 { 9100 p1[i] += 1; 9101 p2[i] = p3[i] +2; 9102 } 9103 this transformation will produce the following semi-hammock: 9104 9105 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }) 9106 { 9107 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165); 9108 vect__12.22_172 = vect__11.19_170 + vect_cst__171; 9109 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172); 9110 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165); 9111 vect__19.28_184 = vect__18.25_182 + vect_cst__183; 9112 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184); 9113 } 9114 */ 9115 9116 void 9117 optimize_mask_stores (struct loop *loop) 9118 { 9119 basic_block *bbs = get_loop_body (loop); 9120 unsigned nbbs = loop->num_nodes; 9121 unsigned i; 9122 basic_block bb; 9123 struct loop *bb_loop; 9124 gimple_stmt_iterator gsi; 9125 gimple *stmt; 9126 auto_vec<gimple *> worklist; 9127 9128 vect_location = find_loop_location (loop); 9129 /* Pick up all masked stores in loop if any. */ 9130 for (i = 0; i < nbbs; i++) 9131 { 9132 bb = bbs[i]; 9133 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 9134 gsi_next (&gsi)) 9135 { 9136 stmt = gsi_stmt (gsi); 9137 if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) 9138 worklist.safe_push (stmt); 9139 } 9140 } 9141 9142 free (bbs); 9143 if (worklist.is_empty ()) 9144 return; 9145 9146 /* Loop has masked stores. */ 9147 while (!worklist.is_empty ()) 9148 { 9149 gimple *last, *last_store; 9150 edge e, efalse; 9151 tree mask; 9152 basic_block store_bb, join_bb; 9153 gimple_stmt_iterator gsi_to; 9154 tree vdef, new_vdef; 9155 gphi *phi; 9156 tree vectype; 9157 tree zero; 9158 9159 last = worklist.pop (); 9160 mask = gimple_call_arg (last, 2); 9161 bb = gimple_bb (last); 9162 /* Create then_bb and if-then structure in CFG, then_bb belongs to 9163 the same loop as if_bb. It could be different to LOOP when two 9164 level loop-nest is vectorized and mask_store belongs to the inner 9165 one. */ 9166 e = split_block (bb, last); 9167 bb_loop = bb->loop_father; 9168 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop)); 9169 join_bb = e->dest; 9170 store_bb = create_empty_bb (bb); 9171 add_bb_to_loop (store_bb, bb_loop); 9172 e->flags = EDGE_TRUE_VALUE; 9173 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE); 9174 /* Put STORE_BB to likely part. */ 9175 efalse->probability = profile_probability::unlikely (); 9176 store_bb->count = efalse->count (); 9177 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU); 9178 if (dom_info_available_p (CDI_DOMINATORS)) 9179 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb); 9180 if (dump_enabled_p ()) 9181 dump_printf_loc (MSG_NOTE, vect_location, 9182 "Create new block %d to sink mask stores.", 9183 store_bb->index); 9184 /* Create vector comparison with boolean result. 
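   Schematically (SSA name illustrative) the generated test is
     if (mask_165 == { 0, ..., 0 })
   with the TRUE edge bypassing STORE_BB and going straight to the join bb,
   and the FALSE edge falling into STORE_BB.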
*/ 9185 vectype = TREE_TYPE (mask); 9186 zero = build_zero_cst (vectype); 9187 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE); 9188 gsi = gsi_last_bb (bb); 9189 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT); 9190 /* Create new PHI node for vdef of the last masked store: 9191 .MEM_2 = VDEF <.MEM_1> 9192 will be converted to 9193 .MEM.3 = VDEF <.MEM_1> 9194 and new PHI node will be created in join bb 9195 .MEM_2 = PHI <.MEM_1, .MEM_3> 9196 */ 9197 vdef = gimple_vdef (last); 9198 new_vdef = make_ssa_name (gimple_vop (cfun), last); 9199 gimple_set_vdef (last, new_vdef); 9200 phi = create_phi_node (vdef, join_bb); 9201 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION); 9202 9203 /* Put all masked stores with the same mask to STORE_BB if possible. */ 9204 while (true) 9205 { 9206 gimple_stmt_iterator gsi_from; 9207 gimple *stmt1 = NULL; 9208 9209 /* Move masked store to STORE_BB. */ 9210 last_store = last; 9211 gsi = gsi_for_stmt (last); 9212 gsi_from = gsi; 9213 /* Shift GSI to the previous stmt for further traversal. */ 9214 gsi_prev (&gsi); 9215 gsi_to = gsi_start_bb (store_bb); 9216 gsi_move_before (&gsi_from, &gsi_to); 9217 /* Setup GSI_TO to the non-empty block start. */ 9218 gsi_to = gsi_start_bb (store_bb); 9219 if (dump_enabled_p ()) 9220 { 9221 dump_printf_loc (MSG_NOTE, vect_location, 9222 "Move stmt to created bb\n"); 9223 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0); 9224 } 9225 /* Move all stored value producers if possible. */ 9226 while (!gsi_end_p (gsi)) 9227 { 9228 tree lhs; 9229 imm_use_iterator imm_iter; 9230 use_operand_p use_p; 9231 bool res; 9232 9233 /* Skip debug statements. */ 9234 if (is_gimple_debug (gsi_stmt (gsi))) 9235 { 9236 gsi_prev (&gsi); 9237 continue; 9238 } 9239 stmt1 = gsi_stmt (gsi); 9240 /* Do not consider statements writing to memory or having 9241 volatile operand. */ 9242 if (gimple_vdef (stmt1) 9243 || gimple_has_volatile_ops (stmt1)) 9244 break; 9245 gsi_from = gsi; 9246 gsi_prev (&gsi); 9247 lhs = gimple_get_lhs (stmt1); 9248 if (!lhs) 9249 break; 9250 9251 /* LHS of vectorized stmt must be SSA_NAME. */ 9252 if (TREE_CODE (lhs) != SSA_NAME) 9253 break; 9254 9255 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 9256 { 9257 /* Remove dead scalar statement. */ 9258 if (has_zero_uses (lhs)) 9259 { 9260 gsi_remove (&gsi_from, true); 9261 continue; 9262 } 9263 } 9264 9265 /* Check that LHS does not have uses outside of STORE_BB. */ 9266 res = true; 9267 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 9268 { 9269 gimple *use_stmt; 9270 use_stmt = USE_STMT (use_p); 9271 if (is_gimple_debug (use_stmt)) 9272 continue; 9273 if (gimple_bb (use_stmt) != store_bb) 9274 { 9275 res = false; 9276 break; 9277 } 9278 } 9279 if (!res) 9280 break; 9281 9282 if (gimple_vuse (stmt1) 9283 && gimple_vuse (stmt1) != gimple_vuse (last_store)) 9284 break; 9285 9286 /* Can move STMT1 to STORE_BB. */ 9287 if (dump_enabled_p ()) 9288 { 9289 dump_printf_loc (MSG_NOTE, vect_location, 9290 "Move stmt to created bb\n"); 9291 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0); 9292 } 9293 gsi_move_before (&gsi_from, &gsi_to); 9294 /* Shift GSI_TO for further insertion. */ 9295 gsi_prev (&gsi_to); 9296 } 9297 /* Put other masked stores with the same mask to STORE_BB. */ 9298 if (worklist.is_empty () 9299 || gimple_call_arg (worklist.last (), 2) != mask 9300 || worklist.last () != stmt1) 9301 break; 9302 last = worklist.pop (); 9303 } 9304 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION); 9305 } 9306 } 9307
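/* For the example shown before optimize_mask_stores, the code is rewritten
   into roughly the following shape (a sketch; the SSA names follow that
   example):

     if (mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
       goto join_bb;
     else
       goto store_bb;

   store_bb:
     vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
     vect__12.22_172 = vect__11.19_170 + vect_cst__171;
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     ... likewise for the p2/p3 group ...

   join_bb:
     .MEM_2 = PHI <.MEM_1 (if_bb), .MEM_3 (store_bb)>

   so that an all-false mask skips the masked stores and the statements that
   only feed them. */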