/* Loop Vectorization
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"
#include "case-cfn-macros.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

     short a[N]; short b[N]; short c[N]; int i;

     for (i=0; i<N; i++){
       a[i] = b[i] + c[i];
     }

   as if it had been manually vectorized by rewriting the source code into:

     typedef int __attribute__((mode(V8HI))) v8hi;
     short a[N]; short b[N]; short c[N]; int i;
     v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
     v8hi va, vb, vc;

     for (i=0; i<N/8; i++){
       vb = pb[i];
       vc = pc[i];
       va = vb + vc;
       pa[i] = va;
     }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.
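
   (For illustration only, a sketch of what scev provides: for the loop
   above it describes the evolution of 'i' as the chrec {0, +, 1}_1,
   i.e. "starts at 0 and advances by 1 in each iteration of loop 1",
   and the address of b[i] as roughly {&b[0], +, sizeof (short)}_1.
   The analyses below query such access functions to recognize
   inductions and consecutive memory accesses; the exact chrec forms
   shown here are only an example, not output of the analyzer.)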

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   subsequent stmts that use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

     VS1: vb = px[i];
     S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
     S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

     VS1: vb = px[i];
     S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
     VS2: va = vb;
     S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1) and are handled differently.

   Target modeling:
   ================
   Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that support several vector sizes will, for now, need to
   specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
						unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *);

/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).
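
   For illustration only, a sketch (not tied to any particular target)
   of how this feeds the vectorization factor: in a loop such as

     short s[N]; int x[N];
     for (i = 0; i < N; i++)
       x[i] = x[i] + (int) s[i];

   the load from s[] might get an 8-unit vectype (e.g. V8HI) while the
   int statements get a 4-unit one (e.g. V4SI); vect_update_max_nunits
   then raises *VF to 8, the largest number of units seen so far.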
*/ 165 166 static opt_result 167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, 168 bool vectype_maybe_set_p, 169 poly_uint64 *vf) 170 { 171 gimple *stmt = stmt_info->stmt; 172 173 if ((!STMT_VINFO_RELEVANT_P (stmt_info) 174 && !STMT_VINFO_LIVE_P (stmt_info)) 175 || gimple_clobber_p (stmt)) 176 { 177 if (dump_enabled_p ()) 178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n"); 179 return opt_result::success (); 180 } 181 182 tree stmt_vectype, nunits_vectype; 183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info, 184 &stmt_vectype, 185 &nunits_vectype); 186 if (!res) 187 return res; 188 189 if (stmt_vectype) 190 { 191 if (STMT_VINFO_VECTYPE (stmt_info)) 192 /* The only case when a vectype had been already set is for stmts 193 that contain a data ref, or for "pattern-stmts" (stmts generated 194 by the vectorizer to represent/replace a certain idiom). */ 195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info) 196 || vectype_maybe_set_p) 197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype); 198 else 199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype; 200 } 201 202 if (nunits_vectype) 203 vect_update_max_nunits (vf, nunits_vectype); 204 205 return opt_result::success (); 206 } 207 208 /* Subroutine of vect_determine_vectorization_factor. Set the vector 209 types of STMT_INFO and all attached pattern statements and update 210 the vectorization factor VF accordingly. Return true on success 211 or false if something prevented vectorization. */ 212 213 static opt_result 214 vect_determine_vf_for_stmt (vec_info *vinfo, 215 stmt_vec_info stmt_info, poly_uint64 *vf) 216 { 217 if (dump_enabled_p ()) 218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G", 219 stmt_info->stmt); 220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf); 221 if (!res) 222 return res; 223 224 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 225 && STMT_VINFO_RELATED_STMT (stmt_info)) 226 { 227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); 229 230 /* If a pattern statement has def stmts, analyze them too. */ 231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq); 232 !gsi_end_p (si); gsi_next (&si)) 233 { 234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); 235 if (dump_enabled_p ()) 236 dump_printf_loc (MSG_NOTE, vect_location, 237 "==> examining pattern def stmt: %G", 238 def_stmt_info->stmt); 239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf); 240 if (!res) 241 return res; 242 } 243 244 if (dump_enabled_p ()) 245 dump_printf_loc (MSG_NOTE, vect_location, 246 "==> examining pattern statement: %G", 247 stmt_info->stmt); 248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf); 249 if (!res) 250 return res; 251 } 252 253 return opt_result::success (); 254 } 255 256 /* Function vect_determine_vectorization_factor 257 258 Determine the vectorization factor (VF). VF is the number of data elements 259 that are operated upon in parallel in a single iteration of the vectorized 260 loop. For example, when vectorizing a loop that operates on 4byte elements, 261 on a target with vector size (VS) 16byte, the VF is set to 4, since 4 262 elements can fit in a single vector register. 263 264 We currently support vectorization of loops in which all types operated upon 265 are of the same size. 
Therefore this function currently sets VF according to 266 the size of the types operated upon, and fails if there are multiple sizes 267 in the loop. 268 269 VF is also the factor by which the loop iterations are strip-mined, e.g.: 270 original loop: 271 for (i=0; i<N; i++){ 272 a[i] = b[i] + c[i]; 273 } 274 275 vectorized loop: 276 for (i=0; i<N; i+=VF){ 277 a[i:VF] = b[i:VF] + c[i:VF]; 278 } 279 */ 280 281 static opt_result 282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo) 283 { 284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 286 unsigned nbbs = loop->num_nodes; 287 poly_uint64 vectorization_factor = 1; 288 tree scalar_type = NULL_TREE; 289 gphi *phi; 290 tree vectype; 291 stmt_vec_info stmt_info; 292 unsigned i; 293 294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor"); 295 296 for (i = 0; i < nbbs; i++) 297 { 298 basic_block bb = bbs[i]; 299 300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 301 gsi_next (&si)) 302 { 303 phi = si.phi (); 304 stmt_info = loop_vinfo->lookup_stmt (phi); 305 if (dump_enabled_p ()) 306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G", 307 phi); 308 309 gcc_assert (stmt_info); 310 311 if (STMT_VINFO_RELEVANT_P (stmt_info) 312 || STMT_VINFO_LIVE_P (stmt_info)) 313 { 314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info)); 315 scalar_type = TREE_TYPE (PHI_RESULT (phi)); 316 317 if (dump_enabled_p ()) 318 dump_printf_loc (MSG_NOTE, vect_location, 319 "get vectype for scalar type: %T\n", 320 scalar_type); 321 322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); 323 if (!vectype) 324 return opt_result::failure_at (phi, 325 "not vectorized: unsupported " 326 "data-type %T\n", 327 scalar_type); 328 STMT_VINFO_VECTYPE (stmt_info) = vectype; 329 330 if (dump_enabled_p ()) 331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", 332 vectype); 333 334 if (dump_enabled_p ()) 335 { 336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); 337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype)); 338 dump_printf (MSG_NOTE, "\n"); 339 } 340 341 vect_update_max_nunits (&vectorization_factor, vectype); 342 } 343 } 344 345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 346 gsi_next (&si)) 347 { 348 if (is_gimple_debug (gsi_stmt (si))) 349 continue; 350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 351 opt_result res 352 = vect_determine_vf_for_stmt (loop_vinfo, 353 stmt_info, &vectorization_factor); 354 if (!res) 355 return res; 356 } 357 } 358 359 /* TODO: Analyze cost. Decide if worth while to vectorize. */ 360 if (dump_enabled_p ()) 361 { 362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = "); 363 dump_dec (MSG_NOTE, vectorization_factor); 364 dump_printf (MSG_NOTE, "\n"); 365 } 366 367 if (known_le (vectorization_factor, 1U)) 368 return opt_result::failure_at (vect_location, 369 "not vectorized: unsupported data-type\n"); 370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 371 return opt_result::success (); 372 } 373 374 375 /* Function vect_is_simple_iv_evolution. 376 377 FORNOW: A simple evolution of an induction variables in the loop is 378 considered a polynomial evolution. 
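
   For illustration only, a sketch of the chrec forms involved: for

     for (i = 0; i < n; i++)
       p = p + 4;

   the access function of 'p' is {p_0, +, 4}_1, whose evolution part in
   loop 1 is the constant 4 - a "simple" evolution with INIT p_0 and
   STEP 4.  An access function like {0, +, {1, +, 1}_1}_1, i.e. a
   polynomial of degree 2, has a chrec as its evolution part and is
   rejected below.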
*/ 379 380 static bool 381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, 382 tree * step) 383 { 384 tree init_expr; 385 tree step_expr; 386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb); 387 basic_block bb; 388 389 /* When there is no evolution in this loop, the evolution function 390 is not "simple". */ 391 if (evolution_part == NULL_TREE) 392 return false; 393 394 /* When the evolution is a polynomial of degree >= 2 395 the evolution function is not "simple". */ 396 if (tree_is_chrec (evolution_part)) 397 return false; 398 399 step_expr = evolution_part; 400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb)); 401 402 if (dump_enabled_p ()) 403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n", 404 step_expr, init_expr); 405 406 *init = init_expr; 407 *step = step_expr; 408 409 if (TREE_CODE (step_expr) != INTEGER_CST 410 && (TREE_CODE (step_expr) != SSA_NAME 411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr))) 412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb)) 413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) 414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)) 415 || !flag_associative_math))) 416 && (TREE_CODE (step_expr) != REAL_CST 417 || !flag_associative_math)) 418 { 419 if (dump_enabled_p ()) 420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 421 "step unknown.\n"); 422 return false; 423 } 424 425 return true; 426 } 427 428 /* Return true if PHI, described by STMT_INFO, is the inner PHI in 429 what we are assuming is a double reduction. For example, given 430 a structure like this: 431 432 outer1: 433 x_1 = PHI <x_4(outer2), ...>; 434 ... 435 436 inner: 437 x_2 = PHI <x_1(outer1), ...>; 438 ... 439 x_3 = ...; 440 ... 441 442 outer2: 443 x_4 = PHI <x_3(inner)>; 444 ... 445 446 outer loop analysis would treat x_1 as a double reduction phi and 447 this function would then return true for x_2. */ 448 449 static bool 450 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi) 451 { 452 use_operand_p use_p; 453 ssa_op_iter op_iter; 454 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE) 455 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p))) 456 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def) 457 return true; 458 return false; 459 } 460 461 /* Function vect_analyze_scalar_cycles_1. 462 463 Examine the cross iteration def-use cycles of scalar variables 464 in LOOP. LOOP_VINFO represents the loop that is now being 465 considered for vectorization (can be LOOP, or an outer-loop 466 enclosing LOOP). */ 467 468 static void 469 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop) 470 { 471 basic_block bb = loop->header; 472 tree init, step; 473 auto_vec<stmt_vec_info, 64> worklist; 474 gphi_iterator gsi; 475 bool double_reduc, reduc_chain; 476 477 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles"); 478 479 /* First - identify all inductions. Reduction detection assumes that all the 480 inductions have been identified, therefore, this order must not be 481 changed. */ 482 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 483 { 484 gphi *phi = gsi.phi (); 485 tree access_fn = NULL; 486 tree def = PHI_RESULT (phi); 487 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi); 488 489 if (dump_enabled_p ()) 490 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi); 491 492 /* Skip virtual phi's. 
The data dependences that are associated with 493 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ 494 if (virtual_operand_p (def)) 495 continue; 496 497 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; 498 499 /* Analyze the evolution function. */ 500 access_fn = analyze_scalar_evolution (loop, def); 501 if (access_fn) 502 { 503 STRIP_NOPS (access_fn); 504 if (dump_enabled_p ()) 505 dump_printf_loc (MSG_NOTE, vect_location, 506 "Access function of PHI: %T\n", access_fn); 507 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) 508 = initial_condition_in_loop_num (access_fn, loop->num); 509 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) 510 = evolution_part_in_loop_num (access_fn, loop->num); 511 } 512 513 if (!access_fn 514 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) 515 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step) 516 || (LOOP_VINFO_LOOP (loop_vinfo) != loop 517 && TREE_CODE (step) != INTEGER_CST)) 518 { 519 worklist.safe_push (stmt_vinfo); 520 continue; 521 } 522 523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) 524 != NULL_TREE); 525 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE); 526 527 if (dump_enabled_p ()) 528 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n"); 529 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def; 530 } 531 532 533 /* Second - identify all reductions and nested cycles. */ 534 while (worklist.length () > 0) 535 { 536 stmt_vec_info stmt_vinfo = worklist.pop (); 537 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt); 538 tree def = PHI_RESULT (phi); 539 540 if (dump_enabled_p ()) 541 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi); 542 543 gcc_assert (!virtual_operand_p (def) 544 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); 545 546 stmt_vec_info reduc_stmt_info 547 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc, 548 &reduc_chain); 549 if (reduc_stmt_info) 550 { 551 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info; 552 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo; 553 if (double_reduc) 554 { 555 if (dump_enabled_p ()) 556 dump_printf_loc (MSG_NOTE, vect_location, 557 "Detected double reduction.\n"); 558 559 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; 560 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; 561 } 562 else 563 { 564 if (loop != LOOP_VINFO_LOOP (loop_vinfo)) 565 { 566 if (dump_enabled_p ()) 567 dump_printf_loc (MSG_NOTE, vect_location, 568 "Detected vectorizable nested cycle.\n"); 569 570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; 571 } 572 else 573 { 574 if (dump_enabled_p ()) 575 dump_printf_loc (MSG_NOTE, vect_location, 576 "Detected reduction.\n"); 577 578 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; 579 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def; 580 /* Store the reduction cycles for possible vectorization in 581 loop-aware SLP if it was not detected as reduction 582 chain. */ 583 if (! reduc_chain) 584 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push 585 (reduc_stmt_info); 586 } 587 } 588 } 589 else 590 if (dump_enabled_p ()) 591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 592 "Unknown def-use cycle pattern.\n"); 593 } 594 } 595 596 597 /* Function vect_analyze_scalar_cycles. 598 599 Examine the cross iteration def-use cycles of scalar variables, by 600 analyzing the loop-header PHIs of scalar variables. 
Classify each 601 cycle as one of the following: invariant, induction, reduction, unknown. 602 We do that for the loop represented by LOOP_VINFO, and also to its 603 inner-loop, if exists. 604 Examples for scalar cycles: 605 606 Example1: reduction: 607 608 loop1: 609 for (i=0; i<N; i++) 610 sum += a[i]; 611 612 Example2: induction: 613 614 loop2: 615 for (i=0; i<N; i++) 616 a[i] = i; */ 617 618 static void 619 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) 620 { 621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 622 623 vect_analyze_scalar_cycles_1 (loop_vinfo, loop); 624 625 /* When vectorizing an outer-loop, the inner-loop is executed sequentially. 626 Reductions in such inner-loop therefore have different properties than 627 the reductions in the nest that gets vectorized: 628 1. When vectorized, they are executed in the same order as in the original 629 scalar loop, so we can't change the order of computation when 630 vectorizing them. 631 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the 632 current checks are too strict. */ 633 634 if (loop->inner) 635 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner); 636 } 637 638 /* Transfer group and reduction information from STMT_INFO to its 639 pattern stmt. */ 640 641 static void 642 vect_fixup_reduc_chain (stmt_vec_info stmt_info) 643 { 644 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info); 645 stmt_vec_info stmtp; 646 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp) 647 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)); 648 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info); 649 do 650 { 651 stmtp = STMT_VINFO_RELATED_STMT (stmt_info); 652 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp) 653 == STMT_VINFO_DEF_TYPE (stmt_info)); 654 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp; 655 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info); 656 if (stmt_info) 657 REDUC_GROUP_NEXT_ELEMENT (stmtp) 658 = STMT_VINFO_RELATED_STMT (stmt_info); 659 } 660 while (stmt_info); 661 } 662 663 /* Fixup scalar cycles that now have their stmts detected as patterns. */ 664 665 static void 666 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) 667 { 668 stmt_vec_info first; 669 unsigned i; 670 671 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first) 672 { 673 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); 674 while (next) 675 { 676 if ((STMT_VINFO_IN_PATTERN_P (next) 677 != STMT_VINFO_IN_PATTERN_P (first)) 678 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1) 679 break; 680 next = REDUC_GROUP_NEXT_ELEMENT (next); 681 } 682 /* If all reduction chain members are well-formed patterns adjust 683 the group to group the pattern stmts instead. */ 684 if (! next 685 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1) 686 { 687 if (STMT_VINFO_IN_PATTERN_P (first)) 688 { 689 vect_fixup_reduc_chain (first); 690 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] 691 = STMT_VINFO_RELATED_STMT (first); 692 } 693 } 694 /* If not all stmt in the chain are patterns or if we failed 695 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle 696 it as regular reduction instead. 
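
	 For illustration only: a chain such as

	   s_1 = s_0 + a[i];
	   s_2 = s_1 + b[i];

	 is a reduction chain; if, say, only the second statement had
	 been replaced by a pattern stmt, the chain would be dissolved
	 here and its last element pushed as a regular reduction instead.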
*/ 697 else 698 { 699 stmt_vec_info vinfo = first; 700 stmt_vec_info last = NULL; 701 while (vinfo) 702 { 703 next = REDUC_GROUP_NEXT_ELEMENT (vinfo); 704 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; 705 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; 706 last = vinfo; 707 vinfo = next; 708 } 709 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first)) 710 = vect_internal_def; 711 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last)); 712 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i); 713 --i; 714 } 715 } 716 } 717 718 /* Function vect_get_loop_niters. 719 720 Determine how many iterations the loop is executed and place it 721 in NUMBER_OF_ITERATIONS. Place the number of latch iterations 722 in NUMBER_OF_ITERATIONSM1. Place the condition under which the 723 niter information holds in ASSUMPTIONS. 724 725 Return the loop exit condition. */ 726 727 728 static gcond * 729 vect_get_loop_niters (class loop *loop, tree *assumptions, 730 tree *number_of_iterations, tree *number_of_iterationsm1) 731 { 732 edge exit = single_exit (loop); 733 class tree_niter_desc niter_desc; 734 tree niter_assumptions, niter, may_be_zero; 735 gcond *cond = get_loop_exit_condition (loop); 736 737 *assumptions = boolean_true_node; 738 *number_of_iterationsm1 = chrec_dont_know; 739 *number_of_iterations = chrec_dont_know; 740 DUMP_VECT_SCOPE ("get_loop_niters"); 741 742 if (!exit) 743 return cond; 744 745 may_be_zero = NULL_TREE; 746 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) 747 || chrec_contains_undetermined (niter_desc.niter)) 748 return cond; 749 750 niter_assumptions = niter_desc.assumptions; 751 may_be_zero = niter_desc.may_be_zero; 752 niter = niter_desc.niter; 753 754 if (may_be_zero && integer_zerop (may_be_zero)) 755 may_be_zero = NULL_TREE; 756 757 if (may_be_zero) 758 { 759 if (COMPARISON_CLASS_P (may_be_zero)) 760 { 761 /* Try to combine may_be_zero with assumptions, this can simplify 762 computation of niter expression. */ 763 if (niter_assumptions && !integer_nonzerop (niter_assumptions)) 764 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, 765 niter_assumptions, 766 fold_build1 (TRUTH_NOT_EXPR, 767 boolean_type_node, 768 may_be_zero)); 769 else 770 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero, 771 build_int_cst (TREE_TYPE (niter), 0), 772 rewrite_to_non_trapping_overflow (niter)); 773 774 may_be_zero = NULL_TREE; 775 } 776 else if (integer_nonzerop (may_be_zero)) 777 { 778 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0); 779 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1); 780 return cond; 781 } 782 else 783 return cond; 784 } 785 786 *assumptions = niter_assumptions; 787 *number_of_iterationsm1 = niter; 788 789 /* We want the number of loop header executions which is the number 790 of latch executions plus one. 791 ??? For UINT_MAX latch executions this number overflows to zero 792 for loops like do { n++; } while (n != 0); */ 793 if (niter && !chrec_contains_undetermined (niter)) 794 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter), 795 build_int_cst (TREE_TYPE (niter), 1)); 796 *number_of_iterations = niter; 797 798 return cond; 799 } 800 801 /* Function bb_in_loop_p 802 803 Used as predicate for dfs order traversal of the loop bbs. 
*/ 804 805 static bool 806 bb_in_loop_p (const_basic_block bb, const void *data) 807 { 808 const class loop *const loop = (const class loop *)data; 809 if (flow_bb_inside_loop_p (loop, bb)) 810 return true; 811 return false; 812 } 813 814 815 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as 816 stmt_vec_info structs for all the stmts in LOOP_IN. */ 817 818 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) 819 : vec_info (vec_info::loop, shared), 820 loop (loop_in), 821 bbs (XCNEWVEC (basic_block, loop->num_nodes)), 822 num_itersm1 (NULL_TREE), 823 num_iters (NULL_TREE), 824 num_iters_unchanged (NULL_TREE), 825 num_iters_assumptions (NULL_TREE), 826 vector_costs (nullptr), 827 scalar_costs (nullptr), 828 th (0), 829 versioning_threshold (0), 830 vectorization_factor (0), 831 main_loop_edge (nullptr), 832 skip_main_loop_edge (nullptr), 833 skip_this_loop_edge (nullptr), 834 reusable_accumulators (), 835 suggested_unroll_factor (1), 836 max_vectorization_factor (0), 837 mask_skip_niters (NULL_TREE), 838 rgroup_compare_type (NULL_TREE), 839 simd_if_cond (NULL_TREE), 840 unaligned_dr (NULL), 841 peeling_for_alignment (0), 842 ptr_mask (0), 843 ivexpr_map (NULL), 844 scan_map (NULL), 845 slp_unrolling_factor (1), 846 inner_loop_cost_factor (param_vect_inner_loop_cost_factor), 847 vectorizable (false), 848 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0), 849 using_partial_vectors_p (false), 850 epil_using_partial_vectors_p (false), 851 partial_load_store_bias (0), 852 peeling_for_gaps (false), 853 peeling_for_niter (false), 854 no_data_dependencies (false), 855 has_mask_store (false), 856 scalar_loop_scaling (profile_probability::uninitialized ()), 857 scalar_loop (NULL), 858 orig_loop_info (NULL) 859 { 860 /* CHECKME: We want to visit all BBs before their successors (except for 861 latch blocks, for which this assertion wouldn't hold). In the simple 862 case of the loop forms we allow, a dfs order of the BBs would the same 863 as reversed postorder traversal, so we are safe. */ 864 865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, 866 bbs, loop->num_nodes, loop); 867 gcc_assert (nbbs == loop->num_nodes); 868 869 for (unsigned int i = 0; i < nbbs; i++) 870 { 871 basic_block bb = bbs[i]; 872 gimple_stmt_iterator si; 873 874 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) 875 { 876 gimple *phi = gsi_stmt (si); 877 gimple_set_uid (phi, 0); 878 add_stmt (phi); 879 } 880 881 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 882 { 883 gimple *stmt = gsi_stmt (si); 884 gimple_set_uid (stmt, 0); 885 if (is_gimple_debug (stmt)) 886 continue; 887 add_stmt (stmt); 888 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the 889 third argument is the #pragma omp simd if (x) condition, when 0, 890 loop shouldn't be vectorized, when non-zero constant, it should 891 be vectorized normally, otherwise versioned with vectorized loop 892 done if the condition is non-zero at runtime. 
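
	     For illustration only, the source form this corresponds to is
	     roughly

	       #pragma omp simd if (c)
	       for (i = 0; i < n; i++)
		 a[i] += b[i];

	     where the third argument of the .GOMP_SIMD_LANE call carries
	     'c': zero means the loop must stay scalar, a non-zero constant
	     means it is vectorized normally, and anything else leads to a
	     runtime-versioned loop.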
*/ 893 if (loop_in->simduid 894 && is_gimple_call (stmt) 895 && gimple_call_internal_p (stmt) 896 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE 897 && gimple_call_num_args (stmt) >= 3 898 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME 899 && (loop_in->simduid 900 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))) 901 { 902 tree arg = gimple_call_arg (stmt, 2); 903 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME) 904 simd_if_cond = arg; 905 else 906 gcc_assert (integer_nonzerop (arg)); 907 } 908 } 909 } 910 911 epilogue_vinfos.create (6); 912 } 913 914 /* Free all levels of rgroup CONTROLS. */ 915 916 void 917 release_vec_loop_controls (vec<rgroup_controls> *controls) 918 { 919 rgroup_controls *rgc; 920 unsigned int i; 921 FOR_EACH_VEC_ELT (*controls, i, rgc) 922 rgc->controls.release (); 923 controls->release (); 924 } 925 926 /* Free all memory used by the _loop_vec_info, as well as all the 927 stmt_vec_info structs of all the stmts in the loop. */ 928 929 _loop_vec_info::~_loop_vec_info () 930 { 931 free (bbs); 932 933 release_vec_loop_controls (&masks); 934 release_vec_loop_controls (&lens); 935 delete ivexpr_map; 936 delete scan_map; 937 epilogue_vinfos.release (); 938 delete scalar_costs; 939 delete vector_costs; 940 941 /* When we release an epiloge vinfo that we do not intend to use 942 avoid clearing AUX of the main loop which should continue to 943 point to the main loop vinfo since otherwise we'll leak that. */ 944 if (loop->aux == this) 945 loop->aux = NULL; 946 } 947 948 /* Return an invariant or register for EXPR and emit necessary 949 computations in the LOOP_VINFO loop preheader. */ 950 951 tree 952 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr) 953 { 954 if (is_gimple_reg (expr) 955 || is_gimple_min_invariant (expr)) 956 return expr; 957 958 if (! loop_vinfo->ivexpr_map) 959 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>; 960 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr); 961 if (! cached) 962 { 963 gimple_seq stmts = NULL; 964 cached = force_gimple_operand (unshare_expr (expr), 965 &stmts, true, NULL_TREE); 966 if (stmts) 967 { 968 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); 969 gsi_insert_seq_on_edge_immediate (e, stmts); 970 } 971 } 972 return cached; 973 } 974 975 /* Return true if we can use CMP_TYPE as the comparison type to produce 976 all masks required to mask LOOP_VINFO. */ 977 978 static bool 979 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) 980 { 981 rgroup_controls *rgm; 982 unsigned int i; 983 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 984 if (rgm->type != NULL_TREE 985 && !direct_internal_fn_supported_p (IFN_WHILE_ULT, 986 cmp_type, rgm->type, 987 OPTIMIZE_FOR_SPEED)) 988 return false; 989 return true; 990 } 991 992 /* Calculate the maximum number of scalars per iteration for every 993 rgroup in LOOP_VINFO. */ 994 995 static unsigned int 996 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) 997 { 998 unsigned int res = 1; 999 unsigned int i; 1000 rgroup_controls *rgm; 1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 1002 res = MAX (res, rgm->max_nscalars_per_iter); 1003 return res; 1004 } 1005 1006 /* Calculate the minimum precision necessary to represent: 1007 1008 MAX_NITERS * FACTOR 1009 1010 as an unsigned integer, where MAX_NITERS is the maximum number of 1011 loop header iterations for the original scalar form of LOOP_VINFO. 
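
   For illustration only, a worked example: if the scalar loop is known
   to execute its header at most 1000 times and FACTOR is 2, the product
   is 2000, which needs 11 bits as an unsigned value (2^10 = 1024 < 2000
   <= 2^11 - 1), so the function returns 11.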
*/ 1012 1013 static unsigned 1014 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor) 1015 { 1016 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1017 1018 /* Get the maximum number of iterations that is representable 1019 in the counter type. */ 1020 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); 1021 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; 1022 1023 /* Get a more refined estimate for the number of iterations. */ 1024 widest_int max_back_edges; 1025 if (max_loop_iterations (loop, &max_back_edges)) 1026 max_ni = wi::smin (max_ni, max_back_edges + 1); 1027 1028 /* Work out how many bits we need to represent the limit. */ 1029 return wi::min_precision (max_ni * factor, UNSIGNED); 1030 } 1031 1032 /* True if the loop needs peeling or partial vectors when vectorized. */ 1033 1034 static bool 1035 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo) 1036 { 1037 unsigned HOST_WIDE_INT const_vf; 1038 HOST_WIDE_INT max_niter 1039 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); 1040 1041 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 1042 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) 1043 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO 1044 (loop_vinfo)); 1045 1046 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 1047 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) 1048 { 1049 /* Work out the (constant) number of iterations that need to be 1050 peeled for reasons other than niters. */ 1051 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 1052 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 1053 peel_niter += 1; 1054 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, 1055 LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 1056 return true; 1057 } 1058 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 1059 /* ??? When peeling for gaps but not alignment, we could 1060 try to check whether the (variable) niters is known to be 1061 VF * N + 1. That's something of a niche case though. */ 1062 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 1063 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) 1064 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) 1065 < (unsigned) exact_log2 (const_vf)) 1066 /* In case of versioning, check if the maximum number of 1067 iterations is greater than th. If they are identical, 1068 the epilogue is unnecessary. */ 1069 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 1070 || ((unsigned HOST_WIDE_INT) max_niter 1071 > (th / const_vf) * const_vf)))) 1072 return true; 1073 1074 return false; 1075 } 1076 1077 /* Each statement in LOOP_VINFO can be masked where necessary. Check 1078 whether we can actually generate the masks required. Return true if so, 1079 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */ 1080 1081 static bool 1082 vect_verify_full_masking (loop_vec_info loop_vinfo) 1083 { 1084 unsigned int min_ni_width; 1085 unsigned int max_nscalars_per_iter 1086 = vect_get_max_nscalars_per_iter (loop_vinfo); 1087 1088 /* Use a normal loop if there are no statements that need masking. 1089 This only happens in rare degenerate cases: it means that the loop 1090 has no loads, no stores, and no live-out values. */ 1091 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) 1092 return false; 1093 1094 /* Work out how many bits we need to represent the limit. 
*/ 1095 min_ni_width 1096 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter); 1097 1098 /* Find a scalar mode for which WHILE_ULT is supported. */ 1099 opt_scalar_int_mode cmp_mode_iter; 1100 tree cmp_type = NULL_TREE; 1101 tree iv_type = NULL_TREE; 1102 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); 1103 unsigned int iv_precision = UINT_MAX; 1104 1105 if (iv_limit != -1) 1106 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter, 1107 UNSIGNED); 1108 1109 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) 1110 { 1111 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); 1112 if (cmp_bits >= min_ni_width 1113 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) 1114 { 1115 tree this_type = build_nonstandard_integer_type (cmp_bits, true); 1116 if (this_type 1117 && can_produce_all_loop_masks_p (loop_vinfo, this_type)) 1118 { 1119 /* Although we could stop as soon as we find a valid mode, 1120 there are at least two reasons why that's not always the 1121 best choice: 1122 1123 - An IV that's Pmode or wider is more likely to be reusable 1124 in address calculations than an IV that's narrower than 1125 Pmode. 1126 1127 - Doing the comparison in IV_PRECISION or wider allows 1128 a natural 0-based IV, whereas using a narrower comparison 1129 type requires mitigations against wrap-around. 1130 1131 Conversely, if the IV limit is variable, doing the comparison 1132 in a wider type than the original type can introduce 1133 unnecessary extensions, so picking the widest valid mode 1134 is not always a good choice either. 1135 1136 Here we prefer the first IV type that's Pmode or wider, 1137 and the first comparison type that's IV_PRECISION or wider. 1138 (The comparison type must be no wider than the IV type, 1139 to avoid extensions in the vector loop.) 1140 1141 ??? We might want to try continuing beyond Pmode for ILP32 1142 targets if CMP_BITS < IV_PRECISION. */ 1143 iv_type = this_type; 1144 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type)) 1145 cmp_type = this_type; 1146 if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) 1147 break; 1148 } 1149 } 1150 } 1151 1152 if (!cmp_type) 1153 return false; 1154 1155 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type; 1156 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; 1157 return true; 1158 } 1159 1160 /* Check whether we can use vector access with length based on precison 1161 comparison. So far, to keep it simple, we only allow the case that the 1162 precision of the target supported length is larger than the precision 1163 required by loop niters. */ 1164 1165 static bool 1166 vect_verify_loop_lens (loop_vec_info loop_vinfo) 1167 { 1168 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ()) 1169 return false; 1170 1171 machine_mode len_load_mode = get_len_load_store_mode 1172 (loop_vinfo->vector_mode, true).require (); 1173 machine_mode len_store_mode = get_len_load_store_mode 1174 (loop_vinfo->vector_mode, false).require (); 1175 1176 signed char partial_load_bias = internal_len_load_store_bias 1177 (IFN_LEN_LOAD, len_load_mode); 1178 1179 signed char partial_store_bias = internal_len_load_store_bias 1180 (IFN_LEN_STORE, len_store_mode); 1181 1182 gcc_assert (partial_load_bias == partial_store_bias); 1183 1184 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED) 1185 return false; 1186 1187 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit 1188 len_loads with a length of zero. In order to avoid that we prohibit 1189 more than one loop length here. 
*/ 1190 if (partial_load_bias == -1 1191 && LOOP_VINFO_LENS (loop_vinfo).length () > 1) 1192 return false; 1193 1194 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias; 1195 1196 unsigned int max_nitems_per_iter = 1; 1197 unsigned int i; 1198 rgroup_controls *rgl; 1199 /* Find the maximum number of items per iteration for every rgroup. */ 1200 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl) 1201 { 1202 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor; 1203 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter); 1204 } 1205 1206 /* Work out how many bits we need to represent the length limit. */ 1207 unsigned int min_ni_prec 1208 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter); 1209 1210 /* Now use the maximum of below precisions for one suitable IV type: 1211 - the IV's natural precision 1212 - the precision needed to hold: the maximum number of scalar 1213 iterations multiplied by the scale factor (min_ni_prec above) 1214 - the Pmode precision 1215 1216 If min_ni_prec is less than the precision of the current niters, 1217 we perfer to still use the niters type. Prefer to use Pmode and 1218 wider IV to avoid narrow conversions. */ 1219 1220 unsigned int ni_prec 1221 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo))); 1222 min_ni_prec = MAX (min_ni_prec, ni_prec); 1223 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode)); 1224 1225 tree iv_type = NULL_TREE; 1226 opt_scalar_int_mode tmode_iter; 1227 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT) 1228 { 1229 scalar_mode tmode = tmode_iter.require (); 1230 unsigned int tbits = GET_MODE_BITSIZE (tmode); 1231 1232 /* ??? Do we really want to construct one IV whose precision exceeds 1233 BITS_PER_WORD? */ 1234 if (tbits > BITS_PER_WORD) 1235 break; 1236 1237 /* Find the first available standard integral type. */ 1238 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode)) 1239 { 1240 iv_type = build_nonstandard_integer_type (tbits, true); 1241 break; 1242 } 1243 } 1244 1245 if (!iv_type) 1246 { 1247 if (dump_enabled_p ()) 1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1249 "can't vectorize with length-based partial vectors" 1250 " because there is no suitable iv type.\n"); 1251 return false; 1252 } 1253 1254 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type; 1255 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; 1256 1257 return true; 1258 } 1259 1260 /* Calculate the cost of one scalar iteration of the loop. */ 1261 static void 1262 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) 1263 { 1264 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1265 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1266 int nbbs = loop->num_nodes, factor; 1267 int innerloop_iters, i; 1268 1269 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost"); 1270 1271 /* Gather costs for statements in the scalar loop. */ 1272 1273 /* FORNOW. 
*/ 1274 innerloop_iters = 1; 1275 if (loop->inner) 1276 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); 1277 1278 for (i = 0; i < nbbs; i++) 1279 { 1280 gimple_stmt_iterator si; 1281 basic_block bb = bbs[i]; 1282 1283 if (bb->loop_father == loop->inner) 1284 factor = innerloop_iters; 1285 else 1286 factor = 1; 1287 1288 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1289 { 1290 gimple *stmt = gsi_stmt (si); 1291 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt); 1292 1293 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) 1294 continue; 1295 1296 /* Skip stmts that are not vectorized inside the loop. */ 1297 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info); 1298 if (!STMT_VINFO_RELEVANT_P (vstmt_info) 1299 && (!STMT_VINFO_LIVE_P (vstmt_info) 1300 || !VECTORIZABLE_CYCLE_DEF 1301 (STMT_VINFO_DEF_TYPE (vstmt_info)))) 1302 continue; 1303 1304 vect_cost_for_stmt kind; 1305 if (STMT_VINFO_DATA_REF (stmt_info)) 1306 { 1307 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 1308 kind = scalar_load; 1309 else 1310 kind = scalar_store; 1311 } 1312 else if (vect_nop_conversion_p (stmt_info)) 1313 continue; 1314 else 1315 kind = scalar_stmt; 1316 1317 /* We are using vect_prologue here to avoid scaling twice 1318 by the inner loop factor. */ 1319 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1320 factor, kind, stmt_info, 0, vect_prologue); 1321 } 1322 } 1323 1324 /* Now accumulate cost. */ 1325 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true); 1326 add_stmt_costs (loop_vinfo->scalar_costs, 1327 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo)); 1328 loop_vinfo->scalar_costs->finish_cost (nullptr); 1329 } 1330 1331 1332 /* Function vect_analyze_loop_form. 1333 1334 Verify that certain CFG restrictions hold, including: 1335 - the loop has a pre-header 1336 - the loop has a single entry and exit 1337 - the loop exit condition is simple enough 1338 - the number of iterations can be analyzed, i.e, a countable loop. The 1339 niter could be analyzed under some assumptions. */ 1340 1341 opt_result 1342 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) 1343 { 1344 DUMP_VECT_SCOPE ("vect_analyze_loop_form"); 1345 1346 /* Different restrictions apply when we are considering an inner-most loop, 1347 vs. an outer (nested) loop. 1348 (FORNOW. May want to relax some of these restrictions in the future). */ 1349 1350 info->inner_loop_cond = NULL; 1351 if (!loop->inner) 1352 { 1353 /* Inner-most loop. We currently require that the number of BBs is 1354 exactly 2 (the header and latch). Vectorizable inner-most loops 1355 look like this: 1356 1357 (pre-header) 1358 | 1359 header <--------+ 1360 | | | 1361 | +--> latch --+ 1362 | 1363 (exit-bb) */ 1364 1365 if (loop->num_nodes != 2) 1366 return opt_result::failure_at (vect_location, 1367 "not vectorized:" 1368 " control flow in loop.\n"); 1369 1370 if (empty_block_p (loop->header)) 1371 return opt_result::failure_at (vect_location, 1372 "not vectorized: empty loop.\n"); 1373 } 1374 else 1375 { 1376 class loop *innerloop = loop->inner; 1377 edge entryedge; 1378 1379 /* Nested loop. We currently require that the loop is doubly-nested, 1380 contains a single inner loop, and the number of BBs is exactly 5. 1381 Vectorizable outer-loops look like this: 1382 1383 (pre-header) 1384 | 1385 header <---+ 1386 | | 1387 inner-loop | 1388 | | 1389 tail ------+ 1390 | 1391 (exit-bb) 1392 1393 The inner-loop has the properties expected of inner-most loops 1394 as described above. 
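
	 For illustration only, a sketch of a loop nest with this shape
	 (whether it is actually vectorized also depends on the later
	 analyses):

	   for (i = 0; i < n; i++)
	     {
	       int s = 0;
	       for (j = 0; j < m; j++)
		 s += a[i][j];
	       b[i] = s;
	     }

	 Here the outer i-loop is the vectorization candidate and the
	 j-loop is the single inner loop required above.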
*/ 1395 1396 if ((loop->inner)->inner || (loop->inner)->next) 1397 return opt_result::failure_at (vect_location, 1398 "not vectorized:" 1399 " multiple nested loops.\n"); 1400 1401 if (loop->num_nodes != 5) 1402 return opt_result::failure_at (vect_location, 1403 "not vectorized:" 1404 " control flow in loop.\n"); 1405 1406 entryedge = loop_preheader_edge (innerloop); 1407 if (entryedge->src != loop->header 1408 || !single_exit (innerloop) 1409 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src) 1410 return opt_result::failure_at (vect_location, 1411 "not vectorized:" 1412 " unsupported outerloop form.\n"); 1413 1414 /* Analyze the inner-loop. */ 1415 vect_loop_form_info inner; 1416 opt_result res = vect_analyze_loop_form (loop->inner, &inner); 1417 if (!res) 1418 { 1419 if (dump_enabled_p ()) 1420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1421 "not vectorized: Bad inner loop.\n"); 1422 return res; 1423 } 1424 1425 /* Don't support analyzing niter under assumptions for inner 1426 loop. */ 1427 if (!integer_onep (inner.assumptions)) 1428 return opt_result::failure_at (vect_location, 1429 "not vectorized: Bad inner loop.\n"); 1430 1431 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations)) 1432 return opt_result::failure_at (vect_location, 1433 "not vectorized: inner-loop count not" 1434 " invariant.\n"); 1435 1436 if (dump_enabled_p ()) 1437 dump_printf_loc (MSG_NOTE, vect_location, 1438 "Considering outer-loop vectorization.\n"); 1439 info->inner_loop_cond = inner.loop_cond; 1440 } 1441 1442 if (!single_exit (loop)) 1443 return opt_result::failure_at (vect_location, 1444 "not vectorized: multiple exits.\n"); 1445 if (EDGE_COUNT (loop->header->preds) != 2) 1446 return opt_result::failure_at (vect_location, 1447 "not vectorized:" 1448 " too many incoming edges.\n"); 1449 1450 /* We assume that the loop exit condition is at the end of the loop. i.e, 1451 that the loop is represented as a do-while (with a proper if-guard 1452 before the loop if needed), where the loop header contains all the 1453 executable statements, and the latch is empty. */ 1454 if (!empty_block_p (loop->latch) 1455 || !gimple_seq_empty_p (phi_nodes (loop->latch))) 1456 return opt_result::failure_at (vect_location, 1457 "not vectorized: latch block not empty.\n"); 1458 1459 /* Make sure the exit is not abnormal. 
*/ 1460 edge e = single_exit (loop); 1461 if (e->flags & EDGE_ABNORMAL) 1462 return opt_result::failure_at (vect_location, 1463 "not vectorized:" 1464 " abnormal loop exit edge.\n"); 1465 1466 info->loop_cond 1467 = vect_get_loop_niters (loop, &info->assumptions, 1468 &info->number_of_iterations, 1469 &info->number_of_iterationsm1); 1470 if (!info->loop_cond) 1471 return opt_result::failure_at 1472 (vect_location, 1473 "not vectorized: complicated exit condition.\n"); 1474 1475 if (integer_zerop (info->assumptions) 1476 || !info->number_of_iterations 1477 || chrec_contains_undetermined (info->number_of_iterations)) 1478 return opt_result::failure_at 1479 (info->loop_cond, 1480 "not vectorized: number of iterations cannot be computed.\n"); 1481 1482 if (integer_zerop (info->number_of_iterations)) 1483 return opt_result::failure_at 1484 (info->loop_cond, 1485 "not vectorized: number of iterations = 0.\n"); 1486 1487 if (!(tree_fits_shwi_p (info->number_of_iterations) 1488 && tree_to_shwi (info->number_of_iterations) > 0)) 1489 { 1490 if (dump_enabled_p ()) 1491 { 1492 dump_printf_loc (MSG_NOTE, vect_location, 1493 "Symbolic number of iterations is "); 1494 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations); 1495 dump_printf (MSG_NOTE, "\n"); 1496 } 1497 } 1498 1499 return opt_result::success (); 1500 } 1501 1502 /* Create a loop_vec_info for LOOP with SHARED and the 1503 vect_analyze_loop_form result. */ 1504 1505 loop_vec_info 1506 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared, 1507 const vect_loop_form_info *info, 1508 loop_vec_info main_loop_info) 1509 { 1510 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared); 1511 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1; 1512 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations; 1513 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations; 1514 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info; 1515 /* Also record the assumptions for versioning. */ 1516 if (!integer_onep (info->assumptions) && !main_loop_info) 1517 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions; 1518 1519 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond); 1520 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type; 1521 if (info->inner_loop_cond) 1522 { 1523 stmt_vec_info inner_loop_cond_info 1524 = loop_vinfo->lookup_stmt (info->inner_loop_cond); 1525 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type; 1526 /* If we have an estimate on the number of iterations of the inner 1527 loop use that to limit the scale for costing, otherwise use 1528 --param vect-inner-loop-cost-factor literally. */ 1529 widest_int nit; 1530 if (estimated_stmt_executions (loop->inner, &nit)) 1531 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo) 1532 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi (); 1533 } 1534 1535 return loop_vinfo; 1536 } 1537 1538 1539 1540 /* Scan the loop stmts and dependent on whether there are any (non-)SLP 1541 statements update the vectorization factor. 
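
   For illustration only, a worked example of the two cases handled
   below: if every vectorizable stmt in the loop is covered by an SLP
   instance (pure SLP), the final VF is simply the SLP unrolling factor;
   otherwise, with e.g. a loop VF of 4 and an SLP unrolling factor of 2,
   the two are combined with force_common_multiple, giving 4.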
*/ 1542 1543 static void 1544 vect_update_vf_for_slp (loop_vec_info loop_vinfo) 1545 { 1546 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1547 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1548 int nbbs = loop->num_nodes; 1549 poly_uint64 vectorization_factor; 1550 int i; 1551 1552 DUMP_VECT_SCOPE ("vect_update_vf_for_slp"); 1553 1554 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1555 gcc_assert (known_ne (vectorization_factor, 0U)); 1556 1557 /* If all the stmts in the loop can be SLPed, we perform only SLP, and 1558 vectorization factor of the loop is the unrolling factor required by 1559 the SLP instances. If that unrolling factor is 1, we say, that we 1560 perform pure SLP on loop - cross iteration parallelism is not 1561 exploited. */ 1562 bool only_slp_in_loop = true; 1563 for (i = 0; i < nbbs; i++) 1564 { 1565 basic_block bb = bbs[i]; 1566 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 1567 gsi_next (&si)) 1568 { 1569 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ()); 1570 if (!stmt_info) 1571 continue; 1572 if ((STMT_VINFO_RELEVANT_P (stmt_info) 1573 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1574 && !PURE_SLP_STMT (stmt_info)) 1575 /* STMT needs both SLP and loop-based vectorization. */ 1576 only_slp_in_loop = false; 1577 } 1578 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1579 gsi_next (&si)) 1580 { 1581 if (is_gimple_debug (gsi_stmt (si))) 1582 continue; 1583 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 1584 stmt_info = vect_stmt_to_vectorize (stmt_info); 1585 if ((STMT_VINFO_RELEVANT_P (stmt_info) 1586 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1587 && !PURE_SLP_STMT (stmt_info)) 1588 /* STMT needs both SLP and loop-based vectorization. */ 1589 only_slp_in_loop = false; 1590 } 1591 } 1592 1593 if (only_slp_in_loop) 1594 { 1595 if (dump_enabled_p ()) 1596 dump_printf_loc (MSG_NOTE, vect_location, 1597 "Loop contains only SLP stmts\n"); 1598 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); 1599 } 1600 else 1601 { 1602 if (dump_enabled_p ()) 1603 dump_printf_loc (MSG_NOTE, vect_location, 1604 "Loop contains SLP and non-SLP stmts\n"); 1605 /* Both the vectorization factor and unroll factor have the form 1606 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X, 1607 so they must have a common multiple. */ 1608 vectorization_factor 1609 = force_common_multiple (vectorization_factor, 1610 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); 1611 } 1612 1613 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 1614 if (dump_enabled_p ()) 1615 { 1616 dump_printf_loc (MSG_NOTE, vect_location, 1617 "Updating vectorization factor to "); 1618 dump_dec (MSG_NOTE, vectorization_factor); 1619 dump_printf (MSG_NOTE, ".\n"); 1620 } 1621 } 1622 1623 /* Return true if STMT_INFO describes a double reduction phi and if 1624 the other phi in the reduction is also relevant for vectorization. 1625 This rejects cases such as: 1626 1627 outer1: 1628 x_1 = PHI <x_3(outer2), ...>; 1629 ... 1630 1631 inner: 1632 x_2 = ...; 1633 ... 1634 1635 outer2: 1636 x_3 = PHI <x_2(inner)>; 1637 1638 if nothing in x_2 or elsewhere makes x_1 relevant. */ 1639 1640 static bool 1641 vect_active_double_reduction_p (stmt_vec_info stmt_info) 1642 { 1643 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) 1644 return false; 1645 1646 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info)); 1647 } 1648 1649 /* Function vect_analyze_loop_operations. 
1650 1651 Scan the loop stmts and make sure they are all vectorizable. */ 1652 1653 static opt_result 1654 vect_analyze_loop_operations (loop_vec_info loop_vinfo) 1655 { 1656 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1657 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1658 int nbbs = loop->num_nodes; 1659 int i; 1660 stmt_vec_info stmt_info; 1661 bool need_to_vectorize = false; 1662 bool ok; 1663 1664 DUMP_VECT_SCOPE ("vect_analyze_loop_operations"); 1665 1666 auto_vec<stmt_info_for_cost> cost_vec; 1667 1668 for (i = 0; i < nbbs; i++) 1669 { 1670 basic_block bb = bbs[i]; 1671 1672 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 1673 gsi_next (&si)) 1674 { 1675 gphi *phi = si.phi (); 1676 ok = true; 1677 1678 stmt_info = loop_vinfo->lookup_stmt (phi); 1679 if (dump_enabled_p ()) 1680 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi); 1681 if (virtual_operand_p (gimple_phi_result (phi))) 1682 continue; 1683 1684 /* Inner-loop loop-closed exit phi in outer-loop vectorization 1685 (i.e., a phi in the tail of the outer-loop). */ 1686 if (! is_loop_header_bb_p (bb)) 1687 { 1688 /* FORNOW: we currently don't support the case that these phis 1689 are not used in the outerloop (unless it is double reduction, 1690 i.e., this phi is vect_reduction_def), cause this case 1691 requires to actually do something here. */ 1692 if (STMT_VINFO_LIVE_P (stmt_info) 1693 && !vect_active_double_reduction_p (stmt_info)) 1694 return opt_result::failure_at (phi, 1695 "Unsupported loop-closed phi" 1696 " in outer-loop.\n"); 1697 1698 /* If PHI is used in the outer loop, we check that its operand 1699 is defined in the inner loop. */ 1700 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1701 { 1702 tree phi_op; 1703 1704 if (gimple_phi_num_args (phi) != 1) 1705 return opt_result::failure_at (phi, "unsupported phi"); 1706 1707 phi_op = PHI_ARG_DEF (phi, 0); 1708 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); 1709 if (!op_def_info) 1710 return opt_result::failure_at (phi, "unsupported phi\n"); 1711 1712 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer 1713 && (STMT_VINFO_RELEVANT (op_def_info) 1714 != vect_used_in_outer_by_reduction)) 1715 return opt_result::failure_at (phi, "unsupported phi\n"); 1716 1717 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def 1718 || (STMT_VINFO_DEF_TYPE (stmt_info) 1719 == vect_double_reduction_def)) 1720 && !vectorizable_lc_phi (loop_vinfo, 1721 stmt_info, NULL, NULL)) 1722 return opt_result::failure_at (phi, "unsupported phi\n"); 1723 } 1724 1725 continue; 1726 } 1727 1728 gcc_assert (stmt_info); 1729 1730 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope 1731 || STMT_VINFO_LIVE_P (stmt_info)) 1732 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 1733 /* A scalar-dependence cycle that we don't support. */ 1734 return opt_result::failure_at (phi, 1735 "not vectorized:" 1736 " scalar dependence cycle.\n"); 1737 1738 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1739 { 1740 need_to_vectorize = true; 1741 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 1742 && ! PURE_SLP_STMT (stmt_info)) 1743 ok = vectorizable_induction (loop_vinfo, 1744 stmt_info, NULL, NULL, 1745 &cost_vec); 1746 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1747 || (STMT_VINFO_DEF_TYPE (stmt_info) 1748 == vect_double_reduction_def) 1749 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 1750 && ! 
PURE_SLP_STMT (stmt_info)) 1751 ok = vectorizable_reduction (loop_vinfo, 1752 stmt_info, NULL, NULL, &cost_vec); 1753 } 1754 1755 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ 1756 if (ok 1757 && STMT_VINFO_LIVE_P (stmt_info) 1758 && !PURE_SLP_STMT (stmt_info)) 1759 ok = vectorizable_live_operation (loop_vinfo, 1760 stmt_info, NULL, NULL, NULL, 1761 -1, false, &cost_vec); 1762 1763 if (!ok) 1764 return opt_result::failure_at (phi, 1765 "not vectorized: relevant phi not " 1766 "supported: %G", 1767 static_cast <gimple *> (phi)); 1768 } 1769 1770 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1771 gsi_next (&si)) 1772 { 1773 gimple *stmt = gsi_stmt (si); 1774 if (!gimple_clobber_p (stmt) 1775 && !is_gimple_debug (stmt)) 1776 { 1777 opt_result res 1778 = vect_analyze_stmt (loop_vinfo, 1779 loop_vinfo->lookup_stmt (stmt), 1780 &need_to_vectorize, 1781 NULL, NULL, &cost_vec); 1782 if (!res) 1783 return res; 1784 } 1785 } 1786 } /* bbs */ 1787 1788 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec); 1789 1790 /* All operations in the loop are either irrelevant (deal with loop 1791 control, or dead), or only used outside the loop and can be moved 1792 out of the loop (e.g. invariants, inductions). The loop can be 1793 optimized away by scalar optimizations. We're better off not 1794 touching this loop. */ 1795 if (!need_to_vectorize) 1796 { 1797 if (dump_enabled_p ()) 1798 dump_printf_loc (MSG_NOTE, vect_location, 1799 "All the computation can be taken out of the loop.\n"); 1800 return opt_result::failure_at 1801 (vect_location, 1802 "not vectorized: redundant loop. no profit to vectorize.\n"); 1803 } 1804 1805 return opt_result::success (); 1806 } 1807 1808 /* Return true if we know that the iteration count is smaller than the 1809 vectorization factor. Return false if it isn't, or if we can't be sure 1810 either way. */ 1811 1812 static bool 1813 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo) 1814 { 1815 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 1816 1817 HOST_WIDE_INT max_niter; 1818 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1819 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo); 1820 else 1821 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); 1822 1823 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf) 1824 return true; 1825 1826 return false; 1827 } 1828 1829 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it 1830 is worthwhile to vectorize. Return 1 if definitely yes, 0 if 1831 definitely no, or -1 if it's worth retrying. */ 1832 1833 static int 1834 vect_analyze_loop_costing (loop_vec_info loop_vinfo, 1835 unsigned *suggested_unroll_factor) 1836 { 1837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1838 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 1839 1840 /* Only loops that can handle partially-populated vectors can have iteration 1841 counts less than the vectorization factor. */ 1842 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 1843 { 1844 if (vect_known_niters_smaller_than_vf (loop_vinfo)) 1845 { 1846 if (dump_enabled_p ()) 1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1848 "not vectorized: iteration count smaller than " 1849 "vectorization factor.\n"); 1850 return 0; 1851 } 1852 } 1853 1854 /* If using the "very cheap" model. reject cases in which we'd keep 1855 a copy of the scalar code (even if we might be able to vectorize it). 
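   As an illustrative sketch (the vectorization factor of 4 and the trip
   count of 10 below are assumed values, not anything computed here):

     int a[10], b[10], c[10];
     for (int i = 0; i < 10; i++)
       a[i] = b[i] + c[i];

   With VF = 4, two vector iterations cover i = 0..7 and the remaining two
   iterations would have to be peeled into a scalar epilogue, i.e. a copy
   of the scalar loop would be kept next to the vector loop.  The very
   cheap model refuses to pay that code-size cost, so such loops are
   rejected just below.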
*/ 1856 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP 1857 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 1858 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 1859 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))) 1860 { 1861 if (dump_enabled_p ()) 1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1863 "some scalar iterations would need to be peeled\n"); 1864 return 0; 1865 } 1866 1867 int min_profitable_iters, min_profitable_estimate; 1868 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, 1869 &min_profitable_estimate, 1870 suggested_unroll_factor); 1871 1872 if (min_profitable_iters < 0) 1873 { 1874 if (dump_enabled_p ()) 1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1876 "not vectorized: vectorization not profitable.\n"); 1877 if (dump_enabled_p ()) 1878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1879 "not vectorized: vector version will never be " 1880 "profitable.\n"); 1881 return -1; 1882 } 1883 1884 int min_scalar_loop_bound = (param_min_vect_loop_bound 1885 * assumed_vf); 1886 1887 /* Use the cost model only if it is more conservative than the user-specified 1888 threshold. */ 1889 unsigned int th = (unsigned) MAX (min_scalar_loop_bound, 1890 min_profitable_iters); 1891 1892 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; 1893 1894 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 1895 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) 1896 { 1897 if (dump_enabled_p ()) 1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1899 "not vectorized: vectorization not profitable.\n"); 1900 if (dump_enabled_p ()) 1901 dump_printf_loc (MSG_NOTE, vect_location, 1902 "not vectorized: iteration count smaller than user " 1903 "specified loop bound parameter or minimum profitable " 1904 "iterations (whichever is more conservative).\n"); 1905 return 0; 1906 } 1907 1908 /* The static profitability threshold min_profitable_estimate includes 1909 the cost of having to check at runtime whether the scalar loop 1910 should be used instead. If it turns out that we don't need or want 1911 such a check, the threshold we should use for the static estimate 1912 is simply the point at which the vector loop becomes more profitable 1913 than the scalar loop. */ 1914 if (min_profitable_estimate > min_profitable_iters 1915 && !LOOP_REQUIRES_VERSIONING (loop_vinfo) 1916 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) 1917 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 1918 && !vect_apply_runtime_profitability_check_p (loop_vinfo)) 1919 { 1920 if (dump_enabled_p ()) 1921 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime" 1922 " choice between the scalar and vector loops\n"); 1923 min_profitable_estimate = min_profitable_iters; 1924 } 1925 1926 /* If the vector loop needs multiple iterations to be beneficial then 1927 things are probably too close to call, and the conservative thing 1928 would be to stick with the scalar code.
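   Worked example with assumed numbers: if min_profitable_estimate is 9
   while the VF used for costing is 4, the first vector iteration (worth
   4 scalar iterations) does not recoup the vectorization overhead; only
   after at least 9 scalar iterations, i.e. three vector iterations, does
   the vector loop win.  The very cheap model treats that as too marginal
   and returns 0 below.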
*/ 1929 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP 1930 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo)) 1931 { 1932 if (dump_enabled_p ()) 1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1934 "one iteration of the vector loop would be" 1935 " more expensive than the equivalent number of" 1936 " iterations of the scalar loop\n"); 1937 return 0; 1938 } 1939 1940 HOST_WIDE_INT estimated_niter; 1941 1942 /* If we are vectorizing an epilogue then we know the maximum number of 1943 scalar iterations it will cover is at least one lower than the 1944 vectorization factor of the main loop. */ 1945 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 1946 estimated_niter 1947 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1; 1948 else 1949 { 1950 estimated_niter = estimated_stmt_executions_int (loop); 1951 if (estimated_niter == -1) 1952 estimated_niter = likely_max_stmt_executions_int (loop); 1953 } 1954 if (estimated_niter != -1 1955 && ((unsigned HOST_WIDE_INT) estimated_niter 1956 < MAX (th, (unsigned) min_profitable_estimate))) 1957 { 1958 if (dump_enabled_p ()) 1959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1960 "not vectorized: estimated iteration count too " 1961 "small.\n"); 1962 if (dump_enabled_p ()) 1963 dump_printf_loc (MSG_NOTE, vect_location, 1964 "not vectorized: estimated iteration count smaller " 1965 "than specified loop bound parameter or minimum " 1966 "profitable iterations (whichever is more " 1967 "conservative).\n"); 1968 return -1; 1969 } 1970 1971 return 1; 1972 } 1973 1974 static opt_result 1975 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, 1976 vec<data_reference_p> *datarefs, 1977 unsigned int *n_stmts) 1978 { 1979 *n_stmts = 0; 1980 for (unsigned i = 0; i < loop->num_nodes; i++) 1981 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); 1982 !gsi_end_p (gsi); gsi_next (&gsi)) 1983 { 1984 gimple *stmt = gsi_stmt (gsi); 1985 if (is_gimple_debug (stmt)) 1986 continue; 1987 ++(*n_stmts); 1988 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs, 1989 NULL, 0); 1990 if (!res) 1991 { 1992 if (is_gimple_call (stmt) && loop->safelen) 1993 { 1994 tree fndecl = gimple_call_fndecl (stmt), op; 1995 if (fndecl != NULL_TREE) 1996 { 1997 cgraph_node *node = cgraph_node::get (fndecl); 1998 if (node != NULL && node->simd_clones != NULL) 1999 { 2000 unsigned int j, n = gimple_call_num_args (stmt); 2001 for (j = 0; j < n; j++) 2002 { 2003 op = gimple_call_arg (stmt, j); 2004 if (DECL_P (op) 2005 || (REFERENCE_CLASS_P (op) 2006 && get_base_address (op))) 2007 break; 2008 } 2009 op = gimple_call_lhs (stmt); 2010 /* Ignore #pragma omp declare simd functions 2011 if they don't have data references in the 2012 call stmt itself. */ 2013 if (j == n 2014 && !(op 2015 && (DECL_P (op) 2016 || (REFERENCE_CLASS_P (op) 2017 && get_base_address (op))))) 2018 continue; 2019 } 2020 } 2021 } 2022 return res; 2023 } 2024 /* If dependence analysis will give up due to the limit on the 2025 number of datarefs stop here and fail fatally. */ 2026 if (datarefs->length () 2027 > (unsigned)param_loop_max_datarefs_for_datadeps) 2028 return opt_result::failure_at (stmt, "exceeded param " 2029 "loop-max-datarefs-for-datadeps\n"); 2030 } 2031 return opt_result::success (); 2032 } 2033 2034 /* Look for SLP-only access groups and turn each individual access into its own 2035 group. 
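   Illustrative example (the group layout is assumed, not taken from a
   particular testcase): a store group of size 2 such as

     for (i = 0; i < n; i++)
       {
         a[2*i]     = x[i];
         a[2*i + 1] = y[i];
       }

   may be analyzable only as an SLP instance (STMT_VINFO_SLP_VECT_ONLY).
   If the SLP attempt is abandoned, the group is dissolved below: each
   store becomes its own group of size 1 with DR_GROUP_GAP set to
   group_size - 1 (here 1), so loop-based vectorization sees two strided
   accesses instead of one interleaved group.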
*/ 2036 static void 2037 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) 2038 { 2039 unsigned int i; 2040 struct data_reference *dr; 2041 2042 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups"); 2043 2044 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 2045 FOR_EACH_VEC_ELT (datarefs, i, dr) 2046 { 2047 gcc_assert (DR_REF (dr)); 2048 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr)); 2049 2050 /* Check if the load is a part of an interleaving chain. */ 2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2052 { 2053 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); 2054 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element); 2055 unsigned int group_size = DR_GROUP_SIZE (first_element); 2056 2057 /* Check if SLP-only groups. */ 2058 if (!STMT_SLP_TYPE (stmt_info) 2059 && STMT_VINFO_SLP_VECT_ONLY (first_element)) 2060 { 2061 /* Dissolve the group. */ 2062 STMT_VINFO_SLP_VECT_ONLY (first_element) = false; 2063 2064 stmt_vec_info vinfo = first_element; 2065 while (vinfo) 2066 { 2067 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); 2068 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; 2069 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; 2070 DR_GROUP_SIZE (vinfo) = 1; 2071 if (STMT_VINFO_STRIDED_P (first_element)) 2072 DR_GROUP_GAP (vinfo) = 0; 2073 else 2074 DR_GROUP_GAP (vinfo) = group_size - 1; 2075 /* Duplicate and adjust alignment info, it needs to 2076 be present on each group leader, see dr_misalignment. */ 2077 if (vinfo != first_element) 2078 { 2079 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo); 2080 dr_info2->target_alignment = dr_info->target_alignment; 2081 int misalignment = dr_info->misalignment; 2082 if (misalignment != DR_MISALIGNMENT_UNKNOWN) 2083 { 2084 HOST_WIDE_INT diff 2085 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr)) 2086 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr))); 2087 unsigned HOST_WIDE_INT align_c 2088 = dr_info->target_alignment.to_constant (); 2089 misalignment = (misalignment + diff) % align_c; 2090 } 2091 dr_info2->misalignment = misalignment; 2092 } 2093 vinfo = next; 2094 } 2095 } 2096 } 2097 } 2098 } 2099 2100 /* Determine if operating on full vectors for LOOP_VINFO might leave 2101 some scalar iterations still to do. If so, decide how we should 2102 handle those scalar iterations. The possibilities are: 2103 2104 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors. 2105 In this case: 2106 2107 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true 2108 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false 2109 LOOP_VINFO_PEELING_FOR_NITER == false 2110 2111 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop 2112 to handle the remaining scalar iterations. In this case: 2113 2114 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false 2115 LOOP_VINFO_PEELING_FOR_NITER == true 2116 2117 There are two choices: 2118 2119 (2a) Consider vectorizing the epilogue loop at the same VF as the 2120 main loop, but using partial vectors instead of full vectors. 2121 In this case: 2122 2123 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true 2124 2125 (2b) Consider vectorizing the epilogue loop at lower VFs only. 2126 In this case: 2127 2128 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false 2129 2130 When FOR_EPILOGUE_P is true, make this determination based on the 2131 assumption that LOOP_VINFO is an epilogue loop, otherwise make it 2132 based on the assumption that LOOP_VINFO is the main loop. The caller 2133 has made sure that the number of iterations is set appropriately for 2134 this value of FOR_EPILOGUE_P. 
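   A concrete (assumed) illustration: with a vectorization factor of 16
   and a trip count of 1003 there are 1003 - 62 * 16 = 11 leftover scalar
   iterations.  Choice (1) predicates the single vector loop so that a
   final, partial iteration handles those 11 elements and no scalar
   epilogue is needed.  Choice (2) runs 62 full-vector iterations and
   leaves the 11 remaining iterations to an epilogue loop, which may
   itself use partial vectors at the same VF (2a) or be vectorized only
   at a lower VF (2b).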
*/ 2135 2136 opt_result 2137 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, 2138 bool for_epilogue_p) 2139 { 2140 /* Determine whether there would be any scalar iterations left over. */ 2141 bool need_peeling_or_partial_vectors_p 2142 = vect_need_peeling_or_partial_vectors_p (loop_vinfo); 2143 2144 /* Decide whether to vectorize the loop with partial vectors. */ 2145 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; 2146 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; 2147 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) 2148 && need_peeling_or_partial_vectors_p) 2149 { 2150 /* For partial-vector-usage=1, try to push the handling of partial 2151 vectors to the epilogue, with the main loop continuing to operate 2152 on full vectors. 2153 2154 If we are unrolling we also do not want to use partial vectors. This 2155 is to avoid the overhead of generating multiple masks and also to 2156 avoid having to execute entire iterations of FALSE masked instructions 2157 when dealing with one or less full iterations. 2158 2159 ??? We could then end up failing to use partial vectors if we 2160 decide to peel iterations into a prologue, and if the main loop 2161 then ends up processing fewer than VF iterations. */ 2162 if ((param_vect_partial_vector_usage == 1 2163 || loop_vinfo->suggested_unroll_factor > 1) 2164 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) 2165 && !vect_known_niters_smaller_than_vf (loop_vinfo)) 2166 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; 2167 else 2168 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; 2169 } 2170 2171 if (dump_enabled_p ()) 2172 { 2173 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 2174 dump_printf_loc (MSG_NOTE, vect_location, 2175 "operating on partial vectors%s.\n", 2176 for_epilogue_p ? " for epilogue loop" : ""); 2177 else 2178 dump_printf_loc (MSG_NOTE, vect_location, 2179 "operating only on full vectors%s.\n", 2180 for_epilogue_p ? " for epilogue loop" : ""); 2181 } 2182 2183 if (for_epilogue_p) 2184 { 2185 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); 2186 gcc_assert (orig_loop_vinfo); 2187 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 2188 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 2189 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))); 2190 } 2191 2192 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2193 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 2194 { 2195 /* Check that the loop processes at least one full vector. */ 2196 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2197 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo); 2198 if (known_lt (wi::to_widest (scalar_niters), vf)) 2199 return opt_result::failure_at (vect_location, 2200 "loop does not have enough iterations" 2201 " to support vectorization.\n"); 2202 2203 /* If we need to peel an extra epilogue iteration to handle data 2204 accesses with gaps, check that there are enough scalar iterations 2205 available. 2206 2207 The check above is redundant with this one when peeling for gaps, 2208 but the distinction is useful for diagnostics. 
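   For example (values assumed): with VF = 4 and a known trip count of
   exactly 4, the check above passes because one full vector iteration
   fits.  If the loop also has a grouped access with a gap, the final
   scalar iteration must be peeled off so the vector code never reads
   past the end of the accessed data, leaving only 3 iterations for the
   vector loop, fewer than VF, hence the additional check against
   NITERSM1 below.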
*/ 2209 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); 2210 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2211 && known_lt (wi::to_widest (scalar_nitersm1), vf)) 2212 return opt_result::failure_at (vect_location, 2213 "loop does not have enough iterations" 2214 " to support peeling for gaps.\n"); 2215 } 2216 2217 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) 2218 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) 2219 && need_peeling_or_partial_vectors_p); 2220 2221 return opt_result::success (); 2222 } 2223 2224 /* Function vect_analyze_loop_2. 2225 2226 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2227 for it. The different analyses will record information in the 2228 loop_vec_info struct. */ 2229 static opt_result 2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, 2231 unsigned *suggested_unroll_factor) 2232 { 2233 opt_result ok = opt_result::success (); 2234 int res; 2235 unsigned int max_vf = MAX_VECTORIZATION_FACTOR; 2236 poly_uint64 min_vf = 2; 2237 loop_vec_info orig_loop_vinfo = NULL; 2238 2239 /* If we are dealing with an epilogue then orig_loop_vinfo points to the 2240 loop_vec_info of the first vectorized loop. */ 2241 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 2242 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); 2243 else 2244 orig_loop_vinfo = loop_vinfo; 2245 gcc_assert (orig_loop_vinfo); 2246 2247 /* The first group of checks is independent of the vector size. */ 2248 fatal = true; 2249 2250 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo) 2251 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo))) 2252 return opt_result::failure_at (vect_location, 2253 "not vectorized: simd if(0)\n"); 2254 2255 /* Find all data references in the loop (which correspond to vdefs/vuses) 2256 and analyze their evolution in the loop. */ 2257 2258 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); 2259 2260 /* Gather the data references and count stmts in the loop. */ 2261 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) 2262 { 2263 opt_result res 2264 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), 2265 &LOOP_VINFO_DATAREFS (loop_vinfo), 2266 &LOOP_VINFO_N_STMTS (loop_vinfo)); 2267 if (!res) 2268 { 2269 if (dump_enabled_p ()) 2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2271 "not vectorized: loop contains function " 2272 "calls or data references that cannot " 2273 "be analyzed\n"); 2274 return res; 2275 } 2276 loop_vinfo->shared->save_datarefs (); 2277 } 2278 else 2279 loop_vinfo->shared->check_datarefs (); 2280 2281 /* Analyze the data references and also adjust the minimal 2282 vectorization factor according to the loads and stores. */ 2283 2284 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal); 2285 if (!ok) 2286 { 2287 if (dump_enabled_p ()) 2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2289 "bad data references.\n"); 2290 return ok; 2291 } 2292 2293 /* Classify all cross-iteration scalar data-flow cycles. 2294 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 2295 vect_analyze_scalar_cycles (loop_vinfo); 2296 2297 vect_pattern_recog (loop_vinfo); 2298 2299 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); 2300 2301 /* Analyze the access patterns of the data-refs in the loop (consecutive, 2302 complex, etc.). FORNOW: Only handle consecutive access pattern. 
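   A sketch of what counts as consecutive (example code only, not taken
   from the sources):

     int a[N], b[N];
     for (i = 0; i < N; i++)
       a[i] = b[i];          step of one element: consecutive

     for (i = 0; i < N; i++)
       a[2*i] = b[i];        step of two elements: strided, not consecutive

   Consecutive references map directly onto plain vector loads and
   stores; other patterns need extra support (grouped/interleaved
   accesses, gathers, permutations) that later analysis decides on.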
*/ 2303 2304 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); 2305 if (!ok) 2306 { 2307 if (dump_enabled_p ()) 2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2309 "bad data access.\n"); 2310 return ok; 2311 } 2312 2313 /* Data-flow analysis to detect stmts that do not need to be vectorized. */ 2314 2315 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal); 2316 if (!ok) 2317 { 2318 if (dump_enabled_p ()) 2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2320 "unexpected pattern.\n"); 2321 return ok; 2322 } 2323 2324 /* While the rest of the analysis below depends on it in some way. */ 2325 fatal = false; 2326 2327 /* Analyze data dependences between the data-refs in the loop 2328 and adjust the maximum vectorization factor according to 2329 the dependences. 2330 FORNOW: fail at the first data dependence that we encounter. */ 2331 2332 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); 2333 if (!ok) 2334 { 2335 if (dump_enabled_p ()) 2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2337 "bad data dependence.\n"); 2338 return ok; 2339 } 2340 if (max_vf != MAX_VECTORIZATION_FACTOR 2341 && maybe_lt (max_vf, min_vf)) 2342 return opt_result::failure_at (vect_location, "bad data dependence.\n"); 2343 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; 2344 2345 ok = vect_determine_vectorization_factor (loop_vinfo); 2346 if (!ok) 2347 { 2348 if (dump_enabled_p ()) 2349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2350 "can't determine vectorization factor.\n"); 2351 return ok; 2352 } 2353 if (max_vf != MAX_VECTORIZATION_FACTOR 2354 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 2355 return opt_result::failure_at (vect_location, "bad data dependence.\n"); 2356 2357 /* Compute the scalar iteration cost. */ 2358 vect_compute_single_scalar_iteration_cost (loop_vinfo); 2359 2360 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2361 2362 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ 2363 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo)); 2364 if (!ok) 2365 return ok; 2366 2367 /* If there are any SLP instances mark them as pure_slp. */ 2368 bool slp = vect_make_slp_decision (loop_vinfo); 2369 if (slp) 2370 { 2371 /* Find stmts that need to be both vectorized and SLPed. */ 2372 vect_detect_hybrid_slp (loop_vinfo); 2373 2374 /* Update the vectorization factor based on the SLP decision. */ 2375 vect_update_vf_for_slp (loop_vinfo); 2376 2377 /* Optimize the SLP graph with the vectorization factor fixed. */ 2378 vect_optimize_slp (loop_vinfo); 2379 2380 /* Gather the loads reachable from the SLP graph entries. */ 2381 vect_gather_slp_loads (loop_vinfo); 2382 } 2383 2384 bool saved_can_use_partial_vectors_p 2385 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo); 2386 2387 /* We don't expect to have to roll back to anything other than an empty 2388 set of rgroups. */ 2389 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); 2390 2391 /* This is the point where we can re-start analysis with SLP forced off. */ 2392 start_over: 2393 2394 /* Apply the suggested unrolling factor, this was determined by the backend 2395 during finish_cost the first time we ran the analyzis for this 2396 vector mode. */ 2397 if (loop_vinfo->suggested_unroll_factor > 1) 2398 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor; 2399 2400 /* Now the vectorization factor is final. 
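   For example (numbers assumed): if the loop operates on 16-bit shorts
   and the chosen mode is V8HI, the base VF is 8; if the target also
   suggested an unroll factor of 2 during costing, the multiplication
   just above has already scaled the VF to 16, and that is the value all
   later checks and the dump below report.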
*/ 2401 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2402 gcc_assert (known_ne (vectorization_factor, 0U)); 2403 2404 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) 2405 { 2406 dump_printf_loc (MSG_NOTE, vect_location, 2407 "vectorization_factor = "); 2408 dump_dec (MSG_NOTE, vectorization_factor); 2409 dump_printf (MSG_NOTE, ", niters = %wd\n", 2410 LOOP_VINFO_INT_NITERS (loop_vinfo)); 2411 } 2412 2413 loop_vinfo->vector_costs = init_cost (loop_vinfo, false); 2414 2415 /* Analyze the alignment of the data-refs in the loop. 2416 Fail if a data reference is found that cannot be vectorized. */ 2417 2418 ok = vect_analyze_data_refs_alignment (loop_vinfo); 2419 if (!ok) 2420 { 2421 if (dump_enabled_p ()) 2422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2423 "bad data alignment.\n"); 2424 return ok; 2425 } 2426 2427 /* Prune the list of ddrs to be tested at run-time by versioning for alias. 2428 It is important to call pruning after vect_analyze_data_ref_accesses, 2429 since we use grouping information gathered by interleaving analysis. */ 2430 ok = vect_prune_runtime_alias_test_list (loop_vinfo); 2431 if (!ok) 2432 return ok; 2433 2434 /* Do not invoke vect_enhance_data_refs_alignment for epilogue 2435 vectorization, since we do not want to add extra peeling or 2436 add versioning for alignment. */ 2437 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 2438 /* This pass will decide on using loop versioning and/or loop peeling in 2439 order to enhance the alignment of data references in the loop. */ 2440 ok = vect_enhance_data_refs_alignment (loop_vinfo); 2441 if (!ok) 2442 return ok; 2443 2444 if (slp) 2445 { 2446 /* Analyze operations in the SLP instances. Note this may 2447 remove unsupported SLP instances which makes the above 2448 SLP kind detection invalid. */ 2449 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); 2450 vect_slp_analyze_operations (loop_vinfo); 2451 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) 2452 { 2453 ok = opt_result::failure_at (vect_location, 2454 "unsupported SLP instances\n"); 2455 goto again; 2456 } 2457 2458 /* Check whether any load in ALL SLP instances is possibly permuted. */ 2459 slp_tree load_node, slp_root; 2460 unsigned i, x; 2461 slp_instance instance; 2462 bool can_use_lanes = true; 2463 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance) 2464 { 2465 slp_root = SLP_INSTANCE_TREE (instance); 2466 int group_size = SLP_TREE_LANES (slp_root); 2467 tree vectype = SLP_TREE_VECTYPE (slp_root); 2468 bool loads_permuted = false; 2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) 2470 { 2471 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) 2472 continue; 2473 unsigned j; 2474 stmt_vec_info load_info; 2475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) 2476 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) 2477 { 2478 loads_permuted = true; 2479 break; 2480 } 2481 } 2482 2483 /* If the loads and stores can be handled with load/store-lane 2484 instructions record it and move on to the next instance. */ 2485 if (loads_permuted 2486 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store 2487 && vect_store_lanes_supported (vectype, group_size, false)) 2488 { 2489 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) 2490 { 2491 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT 2492 (SLP_TREE_SCALAR_STMTS (load_node)[0]); 2493 /* Use SLP for strided accesses (or if we can't 2494 load-lanes). 
*/ 2495 if (STMT_VINFO_STRIDED_P (stmt_vinfo) 2496 || ! vect_load_lanes_supported 2497 (STMT_VINFO_VECTYPE (stmt_vinfo), 2498 DR_GROUP_SIZE (stmt_vinfo), false)) 2499 break; 2500 } 2501 2502 can_use_lanes 2503 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length (); 2504 2505 if (can_use_lanes && dump_enabled_p ()) 2506 dump_printf_loc (MSG_NOTE, vect_location, 2507 "SLP instance %p can use load/store-lanes\n", 2508 instance); 2509 } 2510 else 2511 { 2512 can_use_lanes = false; 2513 break; 2514 } 2515 } 2516 2517 /* If all SLP instances can use load/store-lanes abort SLP and try again 2518 with SLP disabled. */ 2519 if (can_use_lanes) 2520 { 2521 ok = opt_result::failure_at (vect_location, 2522 "Built SLP cancelled: can use " 2523 "load/store-lanes\n"); 2524 if (dump_enabled_p ()) 2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2526 "Built SLP cancelled: all SLP instances support " 2527 "load/store-lanes\n"); 2528 goto again; 2529 } 2530 } 2531 2532 /* Dissolve SLP-only groups. */ 2533 vect_dissolve_slp_only_groups (loop_vinfo); 2534 2535 /* Scan all the remaining operations in the loop that are not subject 2536 to SLP and make sure they are vectorizable. */ 2537 ok = vect_analyze_loop_operations (loop_vinfo); 2538 if (!ok) 2539 { 2540 if (dump_enabled_p ()) 2541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2542 "bad operation or unsupported loop bound.\n"); 2543 return ok; 2544 } 2545 2546 /* For now, we don't expect to mix both masking and length approaches for one 2547 loop, disable it if both are recorded. */ 2548 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) 2549 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty () 2550 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()) 2551 { 2552 if (dump_enabled_p ()) 2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2554 "can't vectorize a loop with partial vectors" 2555 " because we don't expect to mix different" 2556 " approaches with partial vectors for the" 2557 " same loop.\n"); 2558 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; 2559 } 2560 2561 /* If we still have the option of using partial vectors, 2562 check whether we can generate the necessary loop controls. */ 2563 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) 2564 && !vect_verify_full_masking (loop_vinfo) 2565 && !vect_verify_loop_lens (loop_vinfo)) 2566 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; 2567 2568 /* If we're vectorizing an epilogue loop, the vectorized loop either needs 2569 to be able to handle fewer than VF scalars, or needs to have a lower VF 2570 than the main loop. */ 2571 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) 2572 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) 2573 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 2574 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))) 2575 return opt_result::failure_at (vect_location, 2576 "Vectorization factor too high for" 2577 " epilogue loop.\n"); 2578 2579 /* Decide whether this loop_vinfo should use partial vectors or peeling, 2580 assuming that the loop will be used as a main loop. We will redo 2581 this analysis later if we instead decide to use the loop as an 2582 epilogue loop. */ 2583 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false); 2584 if (!ok) 2585 return ok; 2586 2587 /* Check the costings of the loop make vectorizing worthwhile. 
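   vect_analyze_loop_costing returns 1 when vectorization looks
   profitable, 0 when it definitely is not, and -1 when this attempt is
   unprofitable but a retry (for instance with SLP disabled via the
   'again' path below) might still succeed; the two branches that follow
   implement exactly that distinction.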
*/ 2588 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor); 2589 if (res < 0) 2590 { 2591 ok = opt_result::failure_at (vect_location, 2592 "Loop costings may not be worthwhile.\n"); 2593 goto again; 2594 } 2595 if (!res) 2596 return opt_result::failure_at (vect_location, 2597 "Loop costings not worthwhile.\n"); 2598 2599 /* If an epilogue loop is required make sure we can create one. */ 2600 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2601 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2602 { 2603 if (dump_enabled_p ()) 2604 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); 2605 if (!vect_can_advance_ivs_p (loop_vinfo) 2606 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo), 2607 single_exit (LOOP_VINFO_LOOP 2608 (loop_vinfo)))) 2609 { 2610 ok = opt_result::failure_at (vect_location, 2611 "not vectorized: can't create required " 2612 "epilog loop\n"); 2613 goto again; 2614 } 2615 } 2616 2617 /* During peeling, we need to check if number of loop iterations is 2618 enough for both peeled prolog loop and vector loop. This check 2619 can be merged along with threshold check of loop versioning, so 2620 increase threshold for this case if necessary. 2621 2622 If we are analyzing an epilogue we still want to check what its 2623 versioning threshold would be. If we decide to vectorize the epilogues we 2624 will want to use the lowest versioning threshold of all epilogues and main 2625 loop. This will enable us to enter a vectorized epilogue even when 2626 versioning the loop. We can't simply check whether the epilogue requires 2627 versioning though since we may have skipped some versioning checks when 2628 analyzing the epilogue. For instance, checks for alias versioning will be 2629 skipped when dealing with epilogues as we assume we already checked them 2630 for the main loop. So instead we always check the 'orig_loop_vinfo'. */ 2631 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) 2632 { 2633 poly_uint64 niters_th = 0; 2634 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 2635 2636 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) 2637 { 2638 /* Niters for peeled prolog loop. */ 2639 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2640 { 2641 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2642 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); 2643 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; 2644 } 2645 else 2646 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2647 } 2648 2649 /* Niters for at least one iteration of vectorized loop. */ 2650 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 2651 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2652 /* One additional iteration because of peeling for gap. */ 2653 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2654 niters_th += 1; 2655 2656 /* Use the same condition as vect_transform_loop to decide when to use 2657 the cost to determine a versioning threshold. */ 2658 if (vect_apply_runtime_profitability_check_p (loop_vinfo) 2659 && ordered_p (th, niters_th)) 2660 niters_th = ordered_max (poly_uint64 (th), niters_th); 2661 2662 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; 2663 } 2664 2665 gcc_assert (known_eq (vectorization_factor, 2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); 2667 2668 /* Ok to vectorize! */ 2669 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 2670 return opt_result::success (); 2671 2672 again: 2673 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). 
*/ 2674 gcc_assert (!ok); 2675 2676 /* Try again with SLP forced off but if we didn't do any SLP there is 2677 no point in re-trying. */ 2678 if (!slp) 2679 return ok; 2680 2681 /* If there are reduction chains re-trying will fail anyway. */ 2682 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) 2683 return ok; 2684 2685 /* Likewise if the grouped loads or stores in the SLP cannot be handled 2686 via interleaving or lane instructions. */ 2687 slp_instance instance; 2688 slp_tree node; 2689 unsigned i, j; 2690 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 2691 { 2692 stmt_vec_info vinfo; 2693 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]; 2694 if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) 2695 continue; 2696 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); 2697 unsigned int size = DR_GROUP_SIZE (vinfo); 2698 tree vectype = STMT_VINFO_VECTYPE (vinfo); 2699 if (! vect_store_lanes_supported (vectype, size, false) 2700 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) 2701 && ! vect_grouped_store_supported (vectype, size)) 2702 return opt_result::failure_at (vinfo->stmt, 2703 "unsupported grouped store\n"); 2704 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) 2705 { 2706 vinfo = SLP_TREE_SCALAR_STMTS (node)[0]; 2707 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); 2708 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo); 2709 size = DR_GROUP_SIZE (vinfo); 2710 vectype = STMT_VINFO_VECTYPE (vinfo); 2711 if (! vect_load_lanes_supported (vectype, size, false) 2712 && ! vect_grouped_load_supported (vectype, single_element_p, 2713 size)) 2714 return opt_result::failure_at (vinfo->stmt, 2715 "unsupported grouped load\n"); 2716 } 2717 } 2718 2719 if (dump_enabled_p ()) 2720 dump_printf_loc (MSG_NOTE, vect_location, 2721 "re-trying with SLP disabled\n"); 2722 2723 /* Roll back state appropriately. No SLP this time. */ 2724 slp = false; 2725 /* Restore vectorization factor as it were without SLP. */ 2726 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; 2727 /* Free the SLP instances. */ 2728 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) 2729 vect_free_slp_instance (instance); 2730 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 2731 /* Reset SLP type to loop_vect on all stmts. */ 2732 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2733 { 2734 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2735 for (gimple_stmt_iterator si = gsi_start_phis (bb); 2736 !gsi_end_p (si); gsi_next (&si)) 2737 { 2738 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2739 STMT_SLP_TYPE (stmt_info) = loop_vect; 2740 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 2741 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 2742 { 2743 /* vectorizable_reduction adjusts reduction stmt def-types, 2744 restore them to that of the PHI. 
*/ 2745 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info)) 2746 = STMT_VINFO_DEF_TYPE (stmt_info); 2747 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize 2748 (STMT_VINFO_REDUC_DEF (stmt_info))) 2749 = STMT_VINFO_DEF_TYPE (stmt_info); 2750 } 2751 } 2752 for (gimple_stmt_iterator si = gsi_start_bb (bb); 2753 !gsi_end_p (si); gsi_next (&si)) 2754 { 2755 if (is_gimple_debug (gsi_stmt (si))) 2756 continue; 2757 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2758 STMT_SLP_TYPE (stmt_info) = loop_vect; 2759 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2760 { 2761 stmt_vec_info pattern_stmt_info 2762 = STMT_VINFO_RELATED_STMT (stmt_info); 2763 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info)) 2764 STMT_VINFO_IN_PATTERN_P (stmt_info) = false; 2765 2766 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 2767 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect; 2768 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq); 2769 !gsi_end_p (pi); gsi_next (&pi)) 2770 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi))) 2771 = loop_vect; 2772 } 2773 } 2774 } 2775 /* Free optimized alias test DDRS. */ 2776 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); 2777 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); 2778 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); 2779 /* Reset target cost data. */ 2780 delete loop_vinfo->vector_costs; 2781 loop_vinfo->vector_costs = nullptr; 2782 /* Reset accumulated rgroup information. */ 2783 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo)); 2784 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo)); 2785 /* Reset assorted flags. */ 2786 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2787 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; 2788 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; 2789 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; 2790 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) 2791 = saved_can_use_partial_vectors_p; 2792 2793 goto start_over; 2794 } 2795 2796 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears 2797 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that 2798 OLD_LOOP_VINFO is better unless something specifically indicates 2799 otherwise. 2800 2801 Note that this deliberately isn't a partial order. */ 2802 2803 static bool 2804 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, 2805 loop_vec_info old_loop_vinfo) 2806 { 2807 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); 2808 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); 2809 2810 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); 2811 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); 2812 2813 /* Always prefer a VF of loop->simdlen over any other VF. */ 2814 if (loop->simdlen) 2815 { 2816 bool new_simdlen_p = known_eq (new_vf, loop->simdlen); 2817 bool old_simdlen_p = known_eq (old_vf, loop->simdlen); 2818 if (new_simdlen_p != old_simdlen_p) 2819 return new_simdlen_p; 2820 } 2821 2822 const auto *old_costs = old_loop_vinfo->vector_costs; 2823 const auto *new_costs = new_loop_vinfo->vector_costs; 2824 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo)) 2825 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop); 2826 2827 return new_costs->better_main_loop_than_p (old_costs); 2828 } 2829 2830 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return 2831 true if we should. 
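   Small assumed example: for a loop with an OpenMP simdlen(8) clause, a
   candidate whose VF is 8 is preferred over an earlier candidate with
   VF 4 regardless of their relative costs, because
   vect_better_loop_vinfo_p gives loop->simdlen absolute priority; only
   when neither or both candidates match simdlen do the target cost hooks
   (better_main_loop_than_p or better_epilogue_loop_than_p) decide.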
*/ 2832 2833 static bool 2834 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, 2835 loop_vec_info old_loop_vinfo) 2836 { 2837 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) 2838 return false; 2839 2840 if (dump_enabled_p ()) 2841 dump_printf_loc (MSG_NOTE, vect_location, 2842 "***** Preferring vector mode %s to vector mode %s\n", 2843 GET_MODE_NAME (new_loop_vinfo->vector_mode), 2844 GET_MODE_NAME (old_loop_vinfo->vector_mode)); 2845 return true; 2846 } 2847 2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is 2849 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance 2850 MODE_I to the next mode useful to analyze. 2851 Return the loop_vinfo on success and wrapped null on failure. */ 2852 2853 static opt_loop_vec_info 2854 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, 2855 const vect_loop_form_info *loop_form_info, 2856 loop_vec_info main_loop_vinfo, 2857 const vector_modes &vector_modes, unsigned &mode_i, 2858 machine_mode &autodetected_vector_mode, 2859 bool &fatal) 2860 { 2861 loop_vec_info loop_vinfo 2862 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo); 2863 2864 machine_mode vector_mode = vector_modes[mode_i]; 2865 loop_vinfo->vector_mode = vector_mode; 2866 unsigned int suggested_unroll_factor = 1; 2867 2868 /* Run the main analysis. */ 2869 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, 2870 &suggested_unroll_factor); 2871 if (dump_enabled_p ()) 2872 dump_printf_loc (MSG_NOTE, vect_location, 2873 "***** Analysis %s with vector mode %s\n", 2874 res ? "succeeded" : " failed", 2875 GET_MODE_NAME (loop_vinfo->vector_mode)); 2876 2877 if (!main_loop_vinfo && suggested_unroll_factor > 1) 2878 { 2879 if (dump_enabled_p ()) 2880 dump_printf_loc (MSG_NOTE, vect_location, 2881 "***** Re-trying analysis for unrolling" 2882 " with unroll factor %d.\n", 2883 suggested_unroll_factor); 2884 loop_vec_info unroll_vinfo 2885 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo); 2886 unroll_vinfo->vector_mode = vector_mode; 2887 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; 2888 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL); 2889 if (new_res) 2890 { 2891 delete loop_vinfo; 2892 loop_vinfo = unroll_vinfo; 2893 } 2894 else 2895 delete unroll_vinfo; 2896 } 2897 2898 /* Remember the autodetected vector mode. */ 2899 if (vector_mode == VOIDmode) 2900 autodetected_vector_mode = loop_vinfo->vector_mode; 2901 2902 /* Advance mode_i, first skipping modes that would result in the 2903 same analysis result. 
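   Two kinds of redundant retries are skipped here: modes for which
   vect_chooses_same_modes_p reports that every statement would be given
   the same vector types as in the analysis just performed, and a mode
   that is mutually related to the autodetected mode (each maps to the
   other via related_vector_mode), i.e. the same vector shape reached
   from a different element type; re-analyzing either would only repeat
   the result.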
*/ 2904 while (mode_i + 1 < vector_modes.length () 2905 && vect_chooses_same_modes_p (loop_vinfo, 2906 vector_modes[mode_i + 1])) 2907 { 2908 if (dump_enabled_p ()) 2909 dump_printf_loc (MSG_NOTE, vect_location, 2910 "***** The result for vector mode %s would" 2911 " be the same\n", 2912 GET_MODE_NAME (vector_modes[mode_i + 1])); 2913 mode_i += 1; 2914 } 2915 if (mode_i + 1 < vector_modes.length () 2916 && VECTOR_MODE_P (autodetected_vector_mode) 2917 && (related_vector_mode (vector_modes[mode_i + 1], 2918 GET_MODE_INNER (autodetected_vector_mode)) 2919 == autodetected_vector_mode) 2920 && (related_vector_mode (autodetected_vector_mode, 2921 GET_MODE_INNER (vector_modes[mode_i + 1])) 2922 == vector_modes[mode_i + 1])) 2923 { 2924 if (dump_enabled_p ()) 2925 dump_printf_loc (MSG_NOTE, vect_location, 2926 "***** Skipping vector mode %s, which would" 2927 " repeat the analysis for %s\n", 2928 GET_MODE_NAME (vector_modes[mode_i + 1]), 2929 GET_MODE_NAME (autodetected_vector_mode)); 2930 mode_i += 1; 2931 } 2932 mode_i++; 2933 2934 if (!res) 2935 { 2936 delete loop_vinfo; 2937 if (fatal) 2938 gcc_checking_assert (main_loop_vinfo == NULL); 2939 return opt_loop_vec_info::propagate_failure (res); 2940 } 2941 2942 return opt_loop_vec_info::success (loop_vinfo); 2943 } 2944 2945 /* Function vect_analyze_loop. 2946 2947 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2948 for it. The different analyses will record information in the 2949 loop_vec_info struct. */ 2950 opt_loop_vec_info 2951 vect_analyze_loop (class loop *loop, vec_info_shared *shared) 2952 { 2953 DUMP_VECT_SCOPE ("analyze_loop_nest"); 2954 2955 if (loop_outer (loop) 2956 && loop_vec_info_for_loop (loop_outer (loop)) 2957 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) 2958 return opt_loop_vec_info::failure_at (vect_location, 2959 "outer-loop already vectorized.\n"); 2960 2961 if (!find_loop_nest (loop, &shared->loop_nest)) 2962 return opt_loop_vec_info::failure_at 2963 (vect_location, 2964 "not vectorized: loop nest containing two or more consecutive inner" 2965 " loops cannot be vectorized\n"); 2966 2967 /* Analyze the loop form. */ 2968 vect_loop_form_info loop_form_info; 2969 opt_result res = vect_analyze_loop_form (loop, &loop_form_info); 2970 if (!res) 2971 { 2972 if (dump_enabled_p ()) 2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2974 "bad loop form.\n"); 2975 return opt_loop_vec_info::propagate_failure (res); 2976 } 2977 if (!integer_onep (loop_form_info.assumptions)) 2978 { 2979 /* We consider to vectorize this loop by versioning it under 2980 some assumptions. In order to do this, we need to clear 2981 existing information computed by scev and niter analyzer. */ 2982 scev_reset_htab (); 2983 free_numbers_of_iterations_estimates (loop); 2984 /* Also set flag for this loop so that following scev and niter 2985 analysis are done under the assumptions. */ 2986 loop_constraint_set (loop, LOOP_C_FINITE); 2987 } 2988 2989 auto_vector_modes vector_modes; 2990 /* Autodetect first vector size we try. 
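   The VOIDmode entry pushed below is a placeholder meaning "let the
   target pick its preferred mode"; the
   targetm.vectorize.autovectorize_vector_modes hook then appends the
   target's other candidates.  Whatever mode the placeholder resolves to
   is recorded in autodetected_vector_mode so that later entries which
   would repeat that analysis can be skipped.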
*/ 2991 vector_modes.safe_push (VOIDmode); 2992 unsigned int autovec_flags 2993 = targetm.vectorize.autovectorize_vector_modes (&vector_modes, 2994 loop->simdlen != 0); 2995 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS) 2996 && !unlimited_cost_model (loop)); 2997 machine_mode autodetected_vector_mode = VOIDmode; 2998 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); 2999 unsigned int mode_i = 0; 3000 unsigned HOST_WIDE_INT simdlen = loop->simdlen; 3001 3002 /* Keep track of the VF for each mode. Initialize all to 0 which indicates 3003 a mode has not been analyzed. */ 3004 auto_vec<poly_uint64, 8> cached_vf_per_mode; 3005 for (unsigned i = 0; i < vector_modes.length (); ++i) 3006 cached_vf_per_mode.safe_push (0); 3007 3008 /* First determine the main loop vectorization mode, either the first 3009 one that works, starting with auto-detecting the vector mode and then 3010 following the targets order of preference, or the one with the 3011 lowest cost if pick_lowest_cost_p. */ 3012 while (1) 3013 { 3014 bool fatal; 3015 unsigned int last_mode_i = mode_i; 3016 /* Set cached VF to -1 prior to analysis, which indicates a mode has 3017 failed. */ 3018 cached_vf_per_mode[last_mode_i] = -1; 3019 opt_loop_vec_info loop_vinfo 3020 = vect_analyze_loop_1 (loop, shared, &loop_form_info, 3021 NULL, vector_modes, mode_i, 3022 autodetected_vector_mode, fatal); 3023 if (fatal) 3024 break; 3025 3026 if (loop_vinfo) 3027 { 3028 /* Analyzis has been successful so update the VF value. The 3029 VF should always be a multiple of unroll_factor and we want to 3030 capture the original VF here. */ 3031 cached_vf_per_mode[last_mode_i] 3032 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 3033 loop_vinfo->suggested_unroll_factor); 3034 /* Once we hit the desired simdlen for the first time, 3035 discard any previous attempts. */ 3036 if (simdlen 3037 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen)) 3038 { 3039 delete first_loop_vinfo; 3040 first_loop_vinfo = opt_loop_vec_info::success (NULL); 3041 simdlen = 0; 3042 } 3043 else if (pick_lowest_cost_p 3044 && first_loop_vinfo 3045 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) 3046 { 3047 /* Pick loop_vinfo over first_loop_vinfo. */ 3048 delete first_loop_vinfo; 3049 first_loop_vinfo = opt_loop_vec_info::success (NULL); 3050 } 3051 if (first_loop_vinfo == NULL) 3052 first_loop_vinfo = loop_vinfo; 3053 else 3054 { 3055 delete loop_vinfo; 3056 loop_vinfo = opt_loop_vec_info::success (NULL); 3057 } 3058 3059 /* Commit to first_loop_vinfo if we have no reason to try 3060 alternatives. */ 3061 if (!simdlen && !pick_lowest_cost_p) 3062 break; 3063 } 3064 if (mode_i == vector_modes.length () 3065 || autodetected_vector_mode == VOIDmode) 3066 break; 3067 3068 /* Try the next biggest vector size. */ 3069 if (dump_enabled_p ()) 3070 dump_printf_loc (MSG_NOTE, vect_location, 3071 "***** Re-trying analysis with vector mode %s\n", 3072 GET_MODE_NAME (vector_modes[mode_i])); 3073 } 3074 if (!first_loop_vinfo) 3075 return opt_loop_vec_info::propagate_failure (res); 3076 3077 if (dump_enabled_p ()) 3078 dump_printf_loc (MSG_NOTE, vect_location, 3079 "***** Choosing vector mode %s\n", 3080 GET_MODE_NAME (first_loop_vinfo->vector_mode)); 3081 3082 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is 3083 enabled, SIMDUID is not set, it is the innermost loop and we have 3084 either already found the loop's SIMDLEN or there was no SIMDLEN to 3085 begin with. 
3086 TODO: Enable epilogue vectorization for loops with SIMDUID set. */ 3087 bool vect_epilogues = (!simdlen 3088 && loop->inner == NULL 3089 && param_vect_epilogues_nomask 3090 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) 3091 && !loop->simduid); 3092 if (!vect_epilogues) 3093 return first_loop_vinfo; 3094 3095 /* Now analyze first_loop_vinfo for epilogue vectorization. */ 3096 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); 3097 3098 /* For epilogues start the analysis from the first mode. The motivation 3099 behind starting from the beginning comes from cases where the VECTOR_MODES 3100 array may contain length-agnostic and length-specific modes. Their 3101 ordering is not guaranteed, so we could end up picking a mode for the main 3102 loop that is after the epilogue's optimal mode. */ 3103 vector_modes[0] = autodetected_vector_mode; 3104 mode_i = 0; 3105 3106 bool supports_partial_vectors = 3107 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0; 3108 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo); 3109 3110 while (1) 3111 { 3112 /* If the target does not support partial vectors we can shorten the 3113 number of modes to analyze for the epilogue as we know we can't pick a 3114 mode that would lead to a VF at least as big as the 3115 FIRST_VINFO_VF. */ 3116 if (!supports_partial_vectors 3117 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf)) 3118 { 3119 mode_i++; 3120 if (mode_i == vector_modes.length ()) 3121 break; 3122 continue; 3123 } 3124 3125 if (dump_enabled_p ()) 3126 dump_printf_loc (MSG_NOTE, vect_location, 3127 "***** Re-trying epilogue analysis with vector " 3128 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i])); 3129 3130 bool fatal; 3131 opt_loop_vec_info loop_vinfo 3132 = vect_analyze_loop_1 (loop, shared, &loop_form_info, 3133 first_loop_vinfo, 3134 vector_modes, mode_i, 3135 autodetected_vector_mode, fatal); 3136 if (fatal) 3137 break; 3138 3139 if (loop_vinfo) 3140 { 3141 if (pick_lowest_cost_p) 3142 { 3143 /* Keep trying to roll back vectorization attempts while the 3144 loop_vec_infos they produced were worse than this one. */ 3145 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos; 3146 while (!vinfos.is_empty () 3147 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) 3148 { 3149 gcc_assert (vect_epilogues); 3150 delete vinfos.pop (); 3151 } 3152 } 3153 /* For now only allow one epilogue loop. */ 3154 if (first_loop_vinfo->epilogue_vinfos.is_empty ()) 3155 { 3156 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); 3157 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); 3158 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 3159 || maybe_ne (lowest_th, 0U)); 3160 /* Keep track of the known smallest versioning 3161 threshold. */ 3162 if (ordered_p (lowest_th, th)) 3163 lowest_th = ordered_min (lowest_th, th); 3164 } 3165 else 3166 { 3167 delete loop_vinfo; 3168 loop_vinfo = opt_loop_vec_info::success (NULL); 3169 } 3170 3171 /* For now only allow one epilogue loop, but allow 3172 pick_lowest_cost_p to replace it, so commit to the 3173 first epilogue if we have no reason to try alternatives. 
*/ 3174 if (!pick_lowest_cost_p) 3175 break; 3176 } 3177 3178 if (mode_i == vector_modes.length ()) 3179 break; 3180 3181 } 3182 3183 if (!first_loop_vinfo->epilogue_vinfos.is_empty ()) 3184 { 3185 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th; 3186 if (dump_enabled_p ()) 3187 dump_printf_loc (MSG_NOTE, vect_location, 3188 "***** Choosing epilogue vector mode %s\n", 3189 GET_MODE_NAME 3190 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode)); 3191 } 3192 3193 return first_loop_vinfo; 3194 } 3195 3196 /* Return true if there is an in-order reduction function for CODE, storing 3197 it in *REDUC_FN if so. */ 3198 3199 static bool 3200 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn) 3201 { 3202 if (code == PLUS_EXPR) 3203 { 3204 *reduc_fn = IFN_FOLD_LEFT_PLUS; 3205 return true; 3206 } 3207 return false; 3208 } 3209 3210 /* Function reduction_fn_for_scalar_code 3211 3212 Input: 3213 CODE - tree_code of a reduction operations. 3214 3215 Output: 3216 REDUC_FN - the corresponding internal function to be used to reduce the 3217 vector of partial results into a single scalar result, or IFN_LAST 3218 if the operation is a supported reduction operation, but does not have 3219 such an internal function. 3220 3221 Return FALSE if CODE currently cannot be vectorized as reduction. */ 3222 3223 bool 3224 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn) 3225 { 3226 if (code.is_tree_code ()) 3227 switch (tree_code (code)) 3228 { 3229 case MAX_EXPR: 3230 *reduc_fn = IFN_REDUC_MAX; 3231 return true; 3232 3233 case MIN_EXPR: 3234 *reduc_fn = IFN_REDUC_MIN; 3235 return true; 3236 3237 case PLUS_EXPR: 3238 *reduc_fn = IFN_REDUC_PLUS; 3239 return true; 3240 3241 case BIT_AND_EXPR: 3242 *reduc_fn = IFN_REDUC_AND; 3243 return true; 3244 3245 case BIT_IOR_EXPR: 3246 *reduc_fn = IFN_REDUC_IOR; 3247 return true; 3248 3249 case BIT_XOR_EXPR: 3250 *reduc_fn = IFN_REDUC_XOR; 3251 return true; 3252 3253 case MULT_EXPR: 3254 case MINUS_EXPR: 3255 *reduc_fn = IFN_LAST; 3256 return true; 3257 3258 default: 3259 return false; 3260 } 3261 else 3262 switch (combined_fn (code)) 3263 { 3264 CASE_CFN_FMAX: 3265 *reduc_fn = IFN_REDUC_FMAX; 3266 return true; 3267 3268 CASE_CFN_FMIN: 3269 *reduc_fn = IFN_REDUC_FMIN; 3270 return true; 3271 3272 default: 3273 return false; 3274 } 3275 } 3276 3277 /* If there is a neutral value X such that a reduction would not be affected 3278 by the introduction of additional X elements, return that X, otherwise 3279 return null. CODE is the code of the reduction and SCALAR_TYPE is type 3280 of the scalar elements. If the reduction has just a single initial value 3281 then INITIAL_VALUE is that value, otherwise it is null. 
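   For example, for a sum (PLUS_EXPR) the neutral value is 0 and for a
   product (MULT_EXPR) it is 1, so a partial vector can be padded with
   such elements without changing the result; BIT_AND_EXPR uses an
   all-ones value for the same reason.  MIN_EXPR, MAX_EXPR and the
   FMIN/FMAX reductions have no universal neutral element, so they can
   only reuse INITIAL_VALUE, and only when a single initial value exists.

   Sketch with an assumed VF of 4, summing six ints a0..a5:

     lanes:  { a0, a1, a2, a3 } + { a4, a5, 0, 0 }

   the two zero lanes leave the final REDUC_PLUS result unchanged.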
*/ 3282 3283 tree 3284 neutral_op_for_reduction (tree scalar_type, code_helper code, 3285 tree initial_value) 3286 { 3287 if (code.is_tree_code ()) 3288 switch (tree_code (code)) 3289 { 3290 case WIDEN_SUM_EXPR: 3291 case DOT_PROD_EXPR: 3292 case SAD_EXPR: 3293 case PLUS_EXPR: 3294 case MINUS_EXPR: 3295 case BIT_IOR_EXPR: 3296 case BIT_XOR_EXPR: 3297 return build_zero_cst (scalar_type); 3298 3299 case MULT_EXPR: 3300 return build_one_cst (scalar_type); 3301 3302 case BIT_AND_EXPR: 3303 return build_all_ones_cst (scalar_type); 3304 3305 case MAX_EXPR: 3306 case MIN_EXPR: 3307 return initial_value; 3308 3309 default: 3310 return NULL_TREE; 3311 } 3312 else 3313 switch (combined_fn (code)) 3314 { 3315 CASE_CFN_FMIN: 3316 CASE_CFN_FMAX: 3317 return initial_value; 3318 3319 default: 3320 return NULL_TREE; 3321 } 3322 } 3323 3324 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement 3325 STMT is printed with a message MSG. */ 3326 3327 static void 3328 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) 3329 { 3330 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); 3331 } 3332 3333 /* Return true if we need an in-order reduction for operation CODE 3334 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer 3335 overflow must wrap. */ 3336 3337 bool 3338 needs_fold_left_reduction_p (tree type, code_helper code) 3339 { 3340 /* CHECKME: check for !flag_finite_math_only too? */ 3341 if (SCALAR_FLOAT_TYPE_P (type)) 3342 { 3343 if (code.is_tree_code ()) 3344 switch (tree_code (code)) 3345 { 3346 case MIN_EXPR: 3347 case MAX_EXPR: 3348 return false; 3349 3350 default: 3351 return !flag_associative_math; 3352 } 3353 else 3354 switch (combined_fn (code)) 3355 { 3356 CASE_CFN_FMIN: 3357 CASE_CFN_FMAX: 3358 return false; 3359 3360 default: 3361 return !flag_associative_math; 3362 } 3363 } 3364 3365 if (INTEGRAL_TYPE_P (type)) 3366 return (!code.is_tree_code () 3367 || !operation_no_trapping_overflow (type, tree_code (code))); 3368 3369 if (SAT_FIXED_POINT_TYPE_P (type)) 3370 return true; 3371 3372 return false; 3373 } 3374 3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and 3376 has a handled computation expression. Store the main reduction 3377 operation in *CODE. */ 3378 3379 static bool 3380 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, 3381 tree loop_arg, code_helper *code, 3382 vec<std::pair<ssa_op_iter, use_operand_p> > &path) 3383 { 3384 auto_bitmap visited; 3385 tree lookfor = PHI_RESULT (phi); 3386 ssa_op_iter curri; 3387 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); 3388 while (USE_FROM_PTR (curr) != loop_arg) 3389 curr = op_iter_next_use (&curri); 3390 curri.i = curri.numops; 3391 do 3392 { 3393 path.safe_push (std::make_pair (curri, curr)); 3394 tree use = USE_FROM_PTR (curr); 3395 if (use == lookfor) 3396 break; 3397 gimple *def = SSA_NAME_DEF_STMT (use); 3398 if (gimple_nop_p (def) 3399 || ! flow_bb_inside_loop_p (loop, gimple_bb (def))) 3400 { 3401 pop: 3402 do 3403 { 3404 std::pair<ssa_op_iter, use_operand_p> x = path.pop (); 3405 curri = x.first; 3406 curr = x.second; 3407 do 3408 curr = op_iter_next_use (&curri); 3409 /* Skip already visited or non-SSA operands (from iterating 3410 over PHI args). */ 3411 while (curr != NULL_USE_OPERAND_P 3412 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 3413 || ! bitmap_set_bit (visited, 3414 SSA_NAME_VERSION 3415 (USE_FROM_PTR (curr))))); 3416 } 3417 while (curr == NULL_USE_OPERAND_P && ! 
path.is_empty ()); 3418 if (curr == NULL_USE_OPERAND_P) 3419 break; 3420 } 3421 else 3422 { 3423 if (gimple_code (def) == GIMPLE_PHI) 3424 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE); 3425 else 3426 curr = op_iter_init_use (&curri, def, SSA_OP_USE); 3427 while (curr != NULL_USE_OPERAND_P 3428 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 3429 || ! bitmap_set_bit (visited, 3430 SSA_NAME_VERSION 3431 (USE_FROM_PTR (curr))))) 3432 curr = op_iter_next_use (&curri); 3433 if (curr == NULL_USE_OPERAND_P) 3434 goto pop; 3435 } 3436 } 3437 while (1); 3438 if (dump_file && (dump_flags & TDF_DETAILS)) 3439 { 3440 dump_printf_loc (MSG_NOTE, loc, "reduction path: "); 3441 unsigned i; 3442 std::pair<ssa_op_iter, use_operand_p> *x; 3443 FOR_EACH_VEC_ELT (path, i, x) 3444 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second)); 3445 dump_printf (MSG_NOTE, "\n"); 3446 } 3447 3448 /* Check whether the reduction path detected is valid. */ 3449 bool fail = path.length () == 0; 3450 bool neg = false; 3451 int sign = -1; 3452 *code = ERROR_MARK; 3453 for (unsigned i = 1; i < path.length (); ++i) 3454 { 3455 gimple *use_stmt = USE_STMT (path[i].second); 3456 gimple_match_op op; 3457 if (!gimple_extract_op (use_stmt, &op)) 3458 { 3459 fail = true; 3460 break; 3461 } 3462 unsigned int opi = op.num_ops; 3463 if (gassign *assign = dyn_cast<gassign *> (use_stmt)) 3464 { 3465 /* The following make sure we can compute the operand index 3466 easily plus it mostly disallows chaining via COND_EXPR condition 3467 operands. */ 3468 for (opi = 0; opi < op.num_ops; ++opi) 3469 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use) 3470 break; 3471 } 3472 else if (gcall *call = dyn_cast<gcall *> (use_stmt)) 3473 { 3474 for (opi = 0; opi < op.num_ops; ++opi) 3475 if (gimple_call_arg_ptr (call, opi) == path[i].second->use) 3476 break; 3477 } 3478 if (opi == op.num_ops) 3479 { 3480 fail = true; 3481 break; 3482 } 3483 op.code = canonicalize_code (op.code, op.type); 3484 if (op.code == MINUS_EXPR) 3485 { 3486 op.code = PLUS_EXPR; 3487 /* Track whether we negate the reduction value each iteration. */ 3488 if (op.ops[1] == op.ops[opi]) 3489 neg = ! neg; 3490 } 3491 if (CONVERT_EXPR_CODE_P (op.code) 3492 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) 3493 ; 3494 else if (*code == ERROR_MARK) 3495 { 3496 *code = op.code; 3497 sign = TYPE_SIGN (op.type); 3498 } 3499 else if (op.code != *code) 3500 { 3501 fail = true; 3502 break; 3503 } 3504 else if ((op.code == MIN_EXPR 3505 || op.code == MAX_EXPR) 3506 && sign != TYPE_SIGN (op.type)) 3507 { 3508 fail = true; 3509 break; 3510 } 3511 /* Check there's only a single stmt the op is used on. For the 3512 not value-changing tail and the last stmt allow out-of-loop uses. 3513 ??? We could relax this and handle arbitrary live stmts by 3514 forcing a scalar epilogue for example. */ 3515 imm_use_iterator imm_iter; 3516 gimple *op_use_stmt; 3517 unsigned cnt = 0; 3518 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi]) 3519 if (!is_gimple_debug (op_use_stmt) 3520 && (*code != ERROR_MARK 3521 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))) 3522 { 3523 /* We want to allow x + x but not x < 1 ? x : 2. */ 3524 if (is_gimple_assign (op_use_stmt) 3525 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR) 3526 { 3527 use_operand_p use_p; 3528 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 3529 cnt++; 3530 } 3531 else 3532 cnt++; 3533 } 3534 if (cnt != 1) 3535 { 3536 fail = true; 3537 break; 3538 } 3539 } 3540 return ! fail && ! 
neg && *code != ERROR_MARK; 3541 } 3542 3543 bool 3544 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, 3545 tree loop_arg, enum tree_code code) 3546 { 3547 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; 3548 code_helper code_; 3549 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path) 3550 && code_ == code); 3551 } 3552 3553 3554 3555 /* Function vect_is_simple_reduction 3556 3557 (1) Detect a cross-iteration def-use cycle that represents a simple 3558 reduction computation. We look for the following pattern: 3559 3560 loop_header: 3561 a1 = phi < a0, a2 > 3562 a3 = ... 3563 a2 = operation (a3, a1) 3564 3565 or 3566 3567 a3 = ... 3568 loop_header: 3569 a1 = phi < a0, a2 > 3570 a2 = operation (a3, a1) 3571 3572 such that: 3573 1. operation is commutative and associative and it is safe to 3574 change the order of the computation 3575 2. no uses for a2 in the loop (a2 is used out of the loop) 3576 3. no uses of a1 in the loop besides the reduction operation 3577 4. no uses of a1 outside the loop. 3578 3579 Conditions 1,4 are tested here. 3580 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. 3581 3582 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 3583 nested cycles. 3584 3585 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double 3586 reductions: 3587 3588 a1 = phi < a0, a2 > 3589 inner loop (def of a3) 3590 a2 = phi < a3 > 3591 3592 (4) Detect condition expressions, ie: 3593 for (int i = 0; i < N; i++) 3594 if (a[i] < val) 3595 ret_val = a[i]; 3596 3597 */ 3598 3599 static stmt_vec_info 3600 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, 3601 bool *double_reduc, bool *reduc_chain_p) 3602 { 3603 gphi *phi = as_a <gphi *> (phi_info->stmt); 3604 gimple *phi_use_stmt = NULL; 3605 imm_use_iterator imm_iter; 3606 use_operand_p use_p; 3607 3608 *double_reduc = false; 3609 *reduc_chain_p = false; 3610 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; 3611 3612 tree phi_name = PHI_RESULT (phi); 3613 /* ??? If there are no uses of the PHI result the inner loop reduction 3614 won't be detected as possibly double-reduction by vectorizable_reduction 3615 because that tries to walk the PHI arg from the preheader edge which 3616 can be constant. See PR60382. 
*/ 3617 if (has_zero_uses (phi_name)) 3618 return NULL; 3619 class loop *loop = (gimple_bb (phi))->loop_father; 3620 unsigned nphi_def_loop_uses = 0; 3621 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) 3622 { 3623 gimple *use_stmt = USE_STMT (use_p); 3624 if (is_gimple_debug (use_stmt)) 3625 continue; 3626 3627 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3628 { 3629 if (dump_enabled_p ()) 3630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3631 "intermediate value used outside loop.\n"); 3632 3633 return NULL; 3634 } 3635 3636 nphi_def_loop_uses++; 3637 phi_use_stmt = use_stmt; 3638 } 3639 3640 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); 3641 if (TREE_CODE (latch_def) != SSA_NAME) 3642 { 3643 if (dump_enabled_p ()) 3644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3645 "reduction: not ssa_name: %T\n", latch_def); 3646 return NULL; 3647 } 3648 3649 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def); 3650 if (!def_stmt_info 3651 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) 3652 return NULL; 3653 3654 bool nested_in_vect_loop 3655 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop); 3656 unsigned nlatch_def_loop_uses = 0; 3657 auto_vec<gphi *, 3> lcphis; 3658 bool inner_loop_of_double_reduc = false; 3659 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def) 3660 { 3661 gimple *use_stmt = USE_STMT (use_p); 3662 if (is_gimple_debug (use_stmt)) 3663 continue; 3664 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3665 nlatch_def_loop_uses++; 3666 else 3667 { 3668 /* We can have more than one loop-closed PHI. */ 3669 lcphis.safe_push (as_a <gphi *> (use_stmt)); 3670 if (nested_in_vect_loop 3671 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt)) 3672 == vect_double_reduction_def)) 3673 inner_loop_of_double_reduc = true; 3674 } 3675 } 3676 3677 /* If we are vectorizing an inner reduction we are executing that 3678 in the original order only in case we are not dealing with a 3679 double reduction. */ 3680 if (nested_in_vect_loop && !inner_loop_of_double_reduc) 3681 { 3682 if (dump_enabled_p ()) 3683 report_vect_op (MSG_NOTE, def_stmt_info->stmt, 3684 "detected nested cycle: "); 3685 return def_stmt_info; 3686 } 3687 3688 /* When the inner loop of a double reduction ends up with more than 3689 one loop-closed PHI we have failed to classify alternate such 3690 PHIs as double reduction, leading to wrong code. See PR103237. */ 3691 if (inner_loop_of_double_reduc && lcphis.length () != 1) 3692 { 3693 if (dump_enabled_p ()) 3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3695 "unhandle double reduction\n"); 3696 return NULL; 3697 } 3698 3699 /* If this isn't a nested cycle or if the nested cycle reduction value 3700 is used ouside of the inner loop we cannot handle uses of the reduction 3701 value. */ 3702 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) 3703 { 3704 if (dump_enabled_p ()) 3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3706 "reduction used in loop.\n"); 3707 return NULL; 3708 } 3709 3710 /* If DEF_STMT is a phi node itself, we expect it to have a single argument 3711 defined in the inner loop. 
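   For example (illustrative source only), in the double reduction

     int sum = 0;
     for (int i = 0; i < n; i++)     <-- outer loop: a1 = phi <a0, a2>
       for (int j = 0; j < m; j++)   <-- inner loop computes a3
         sum += a[i][j];

   the latch value of the outer-loop PHI is defined by the loop-closed PHI
   of the inner loop, a2 = phi <a3>, which has exactly one argument; this
   is pattern (3) in the overview above.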
*/ 3712 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt)) 3713 { 3714 tree op1 = PHI_ARG_DEF (def_stmt, 0); 3715 if (gimple_phi_num_args (def_stmt) != 1 3716 || TREE_CODE (op1) != SSA_NAME) 3717 { 3718 if (dump_enabled_p ()) 3719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3720 "unsupported phi node definition.\n"); 3721 3722 return NULL; 3723 } 3724 3725 gimple *def1 = SSA_NAME_DEF_STMT (op1); 3726 if (gimple_bb (def1) 3727 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 3728 && loop->inner 3729 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) 3730 && (is_gimple_assign (def1) || is_gimple_call (def1)) 3731 && is_a <gphi *> (phi_use_stmt) 3732 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) 3733 { 3734 if (dump_enabled_p ()) 3735 report_vect_op (MSG_NOTE, def_stmt, 3736 "detected double reduction: "); 3737 3738 *double_reduc = true; 3739 return def_stmt_info; 3740 } 3741 3742 return NULL; 3743 } 3744 3745 /* Look for the expression computing latch_def from then loop PHI result. */ 3746 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; 3747 code_helper code; 3748 if (check_reduction_path (vect_location, loop, phi, latch_def, &code, 3749 path)) 3750 { 3751 STMT_VINFO_REDUC_CODE (phi_info) = code; 3752 if (code == COND_EXPR && !nested_in_vect_loop) 3753 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION; 3754 3755 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP 3756 reduction chain for which the additional restriction is that 3757 all operations in the chain are the same. */ 3758 auto_vec<stmt_vec_info, 8> reduc_chain; 3759 unsigned i; 3760 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR; 3761 for (i = path.length () - 1; i >= 1; --i) 3762 { 3763 gimple *stmt = USE_STMT (path[i].second); 3764 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt); 3765 gimple_match_op op; 3766 if (!gimple_extract_op (stmt, &op)) 3767 gcc_unreachable (); 3768 if (gassign *assign = dyn_cast<gassign *> (stmt)) 3769 STMT_VINFO_REDUC_IDX (stmt_info) 3770 = path[i].second->use - gimple_assign_rhs1_ptr (assign); 3771 else 3772 { 3773 gcall *call = as_a<gcall *> (stmt); 3774 STMT_VINFO_REDUC_IDX (stmt_info) 3775 = path[i].second->use - gimple_call_arg_ptr (call, 0); 3776 } 3777 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code) 3778 && (i == 1 || i == path.length () - 1)); 3779 if ((op.code != code && !leading_conversion) 3780 /* We can only handle the final value in epilogue 3781 generation for reduction chains. */ 3782 || (i != 1 && !has_single_use (gimple_get_lhs (stmt)))) 3783 is_slp_reduc = false; 3784 /* For reduction chains we support a trailing/leading 3785 conversions. We do not store those in the actual chain. */ 3786 if (leading_conversion) 3787 continue; 3788 reduc_chain.safe_push (stmt_info); 3789 } 3790 if (is_slp_reduc && reduc_chain.length () > 1) 3791 { 3792 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) 3793 { 3794 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; 3795 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; 3796 } 3797 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; 3798 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; 3799 3800 /* Save the chain for further analysis in SLP detection. 
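   For example (an illustrative sketch), a reduction chain of size three
   is a series of statements that all accumulate into the same variable
   with the same operation:

     for (i = 0; i < n; i++)
       {
         sum += a[3*i];
         sum += a[3*i + 1];
         sum += a[3*i + 2];
       }

   Each statement of the chain becomes one element of the group linked
   through REDUC_GROUP_FIRST_ELEMENT/REDUC_GROUP_NEXT_ELEMENT above.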
*/ 3801 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); 3802 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length (); 3803 3804 *reduc_chain_p = true; 3805 if (dump_enabled_p ()) 3806 dump_printf_loc (MSG_NOTE, vect_location, 3807 "reduction: detected reduction chain\n"); 3808 } 3809 else if (dump_enabled_p ()) 3810 dump_printf_loc (MSG_NOTE, vect_location, 3811 "reduction: detected reduction\n"); 3812 3813 return def_stmt_info; 3814 } 3815 3816 if (dump_enabled_p ()) 3817 dump_printf_loc (MSG_NOTE, vect_location, 3818 "reduction: unknown pattern\n"); 3819 3820 return NULL; 3821 } 3822 3823 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO. 3824 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations, 3825 or -1 if not known. */ 3826 3827 static int 3828 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue) 3829 { 3830 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3831 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1) 3832 { 3833 if (dump_enabled_p ()) 3834 dump_printf_loc (MSG_NOTE, vect_location, 3835 "cost model: epilogue peel iters set to vf/2 " 3836 "because loop iterations are unknown .\n"); 3837 return assumed_vf / 2; 3838 } 3839 else 3840 { 3841 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); 3842 peel_iters_prologue = MIN (niters, peel_iters_prologue); 3843 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; 3844 /* If we need to peel for gaps, but no peeling is required, we have to 3845 peel VF iterations. */ 3846 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue) 3847 peel_iters_epilogue = assumed_vf; 3848 return peel_iters_epilogue; 3849 } 3850 } 3851 3852 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ 3853 int 3854 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, 3855 int *peel_iters_epilogue, 3856 stmt_vector_for_cost *scalar_cost_vec, 3857 stmt_vector_for_cost *prologue_cost_vec, 3858 stmt_vector_for_cost *epilogue_cost_vec) 3859 { 3860 int retval = 0; 3861 3862 *peel_iters_epilogue 3863 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue); 3864 3865 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 3866 { 3867 /* If peeled iterations are known but number of scalar loop 3868 iterations are unknown, count a taken branch per peeled loop. */ 3869 if (peel_iters_prologue > 0) 3870 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3871 vect_prologue); 3872 if (*peel_iters_epilogue > 0) 3873 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken, 3874 vect_epilogue); 3875 } 3876 3877 stmt_info_for_cost *si; 3878 int j; 3879 if (peel_iters_prologue) 3880 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3881 retval += record_stmt_cost (prologue_cost_vec, 3882 si->count * peel_iters_prologue, 3883 si->kind, si->stmt_info, si->misalign, 3884 vect_prologue); 3885 if (*peel_iters_epilogue) 3886 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3887 retval += record_stmt_cost (epilogue_cost_vec, 3888 si->count * *peel_iters_epilogue, 3889 si->kind, si->stmt_info, si->misalign, 3890 vect_epilogue); 3891 3892 return retval; 3893 } 3894 3895 /* Function vect_estimate_min_profitable_iters 3896 3897 Return the number of iterations required for the vector version of the 3898 loop to be profitable relative to the cost of the scalar version of the 3899 loop. 3900 3901 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold 3902 of iterations for vectorization. 
-1 value means loop vectorization 3903 is not profitable. This returned value may be used for dynamic 3904 profitability check. 3905 3906 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used 3907 for static check against estimated number of iterations. */ 3908 3909 static void 3910 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, 3911 int *ret_min_profitable_niters, 3912 int *ret_min_profitable_estimate, 3913 unsigned *suggested_unroll_factor) 3914 { 3915 int min_profitable_iters; 3916 int min_profitable_estimate; 3917 int peel_iters_prologue; 3918 int peel_iters_epilogue; 3919 unsigned vec_inside_cost = 0; 3920 int vec_outside_cost = 0; 3921 unsigned vec_prologue_cost = 0; 3922 unsigned vec_epilogue_cost = 0; 3923 int scalar_single_iter_cost = 0; 3924 int scalar_outside_cost = 0; 3925 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3926 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 3927 vector_costs *target_cost_data = loop_vinfo->vector_costs; 3928 3929 /* Cost model disabled. */ 3930 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 3931 { 3932 if (dump_enabled_p ()) 3933 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); 3934 *ret_min_profitable_niters = 0; 3935 *ret_min_profitable_estimate = 0; 3936 return; 3937 } 3938 3939 /* Requires loop versioning tests to handle misalignment. */ 3940 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 3941 { 3942 /* FIXME: Make cost depend on complexity of individual check. */ 3943 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); 3944 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue); 3945 if (dump_enabled_p ()) 3946 dump_printf (MSG_NOTE, 3947 "cost model: Adding cost of checks for loop " 3948 "versioning to treat misalignment.\n"); 3949 } 3950 3951 /* Requires loop versioning with alias checks. */ 3952 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 3953 { 3954 /* FIXME: Make cost depend on complexity of individual check. */ 3955 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); 3956 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue); 3957 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length (); 3958 if (len) 3959 /* Count LEN - 1 ANDs and LEN comparisons. */ 3960 (void) add_stmt_cost (target_cost_data, len * 2 - 1, 3961 scalar_stmt, vect_prologue); 3962 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); 3963 if (len) 3964 { 3965 /* Count LEN - 1 ANDs and LEN comparisons. */ 3966 unsigned int nstmts = len * 2 - 1; 3967 /* +1 for each bias that needs adding. */ 3968 for (unsigned int i = 0; i < len; ++i) 3969 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) 3970 nstmts += 1; 3971 (void) add_stmt_cost (target_cost_data, nstmts, 3972 scalar_stmt, vect_prologue); 3973 } 3974 if (dump_enabled_p ()) 3975 dump_printf (MSG_NOTE, 3976 "cost model: Adding cost of checks for loop " 3977 "versioning aliasing.\n"); 3978 } 3979 3980 /* Requires loop versioning with niter checks. */ 3981 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) 3982 { 3983 /* FIXME: Make cost depend on complexity of individual check. 
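      As a rough illustration, the versioning condition for niters simply
      tests the assumptions under which the niter analysis is valid
      (LOOP_VINFO_NITERS_ASSUMPTIONS) and branches:

        if (<niter assumptions hold>)
          <vectorized loop>
        else
          <original scalar loop>

      so a flat cost of one statement is charged below, independent of how
      complex the assumption expression actually is.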
*/ 3984 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, 3985 NULL, NULL, NULL_TREE, 0, vect_prologue); 3986 if (dump_enabled_p ()) 3987 dump_printf (MSG_NOTE, 3988 "cost model: Adding cost of checks for loop " 3989 "versioning niters.\n"); 3990 } 3991 3992 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3993 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3994 vect_prologue); 3995 3996 /* Count statements in scalar loop. Using this as scalar cost for a single 3997 iteration for now. 3998 3999 TODO: Add outer loop support. 4000 4001 TODO: Consider assigning different costs to different scalar 4002 statements. */ 4003 4004 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost (); 4005 4006 /* Add additional cost for the peeled instructions in prologue and epilogue 4007 loop. (For fully-masked loops there will be no peeling.) 4008 4009 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 4010 at compile-time - we assume it's vf/2 (the worst would be vf-1). 4011 4012 TODO: Build an expression that represents peel_iters for prologue and 4013 epilogue to be used in a run-time test. */ 4014 4015 bool prologue_need_br_taken_cost = false; 4016 bool prologue_need_br_not_taken_cost = false; 4017 4018 /* Calculate peel_iters_prologue. */ 4019 if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) 4020 peel_iters_prologue = 0; 4021 else if (npeel < 0) 4022 { 4023 peel_iters_prologue = assumed_vf / 2; 4024 if (dump_enabled_p ()) 4025 dump_printf (MSG_NOTE, "cost model: " 4026 "prologue peel iters set to vf/2.\n"); 4027 4028 /* If peeled iterations are unknown, count a taken branch and a not taken 4029 branch per peeled loop. Even if scalar loop iterations are known, 4030 vector iterations are not known since peeled prologue iterations are 4031 not known. Hence guards remain the same. */ 4032 prologue_need_br_taken_cost = true; 4033 prologue_need_br_not_taken_cost = true; 4034 } 4035 else 4036 { 4037 peel_iters_prologue = npeel; 4038 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0) 4039 /* If peeled iterations are known but number of scalar loop 4040 iterations are unknown, count a taken branch per peeled loop. */ 4041 prologue_need_br_taken_cost = true; 4042 } 4043 4044 bool epilogue_need_br_taken_cost = false; 4045 bool epilogue_need_br_not_taken_cost = false; 4046 4047 /* Calculate peel_iters_epilogue. */ 4048 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 4049 /* We need to peel exactly one iteration for gaps. */ 4050 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; 4051 else if (npeel < 0) 4052 { 4053 /* If peeling for alignment is unknown, loop bound of main loop 4054 becomes unknown. */ 4055 peel_iters_epilogue = assumed_vf / 2; 4056 if (dump_enabled_p ()) 4057 dump_printf (MSG_NOTE, "cost model: " 4058 "epilogue peel iters set to vf/2 because " 4059 "peeling for alignment is unknown.\n"); 4060 4061 /* See the same reason above in peel_iters_prologue calculation. */ 4062 epilogue_need_br_taken_cost = true; 4063 epilogue_need_br_not_taken_cost = true; 4064 } 4065 else 4066 { 4067 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel); 4068 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0) 4069 /* If peeled iterations are known but number of scalar loop 4070 iterations are unknown, count a taken branch per peeled loop. 
*/ 4071 epilogue_need_br_taken_cost = true; 4072 } 4073 4074 stmt_info_for_cost *si; 4075 int j; 4076 /* Add costs associated with peel_iters_prologue. */ 4077 if (peel_iters_prologue) 4078 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) 4079 { 4080 (void) add_stmt_cost (target_cost_data, 4081 si->count * peel_iters_prologue, si->kind, 4082 si->stmt_info, si->node, si->vectype, 4083 si->misalign, vect_prologue); 4084 } 4085 4086 /* Add costs associated with peel_iters_epilogue. */ 4087 if (peel_iters_epilogue) 4088 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) 4089 { 4090 (void) add_stmt_cost (target_cost_data, 4091 si->count * peel_iters_epilogue, si->kind, 4092 si->stmt_info, si->node, si->vectype, 4093 si->misalign, vect_epilogue); 4094 } 4095 4096 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */ 4097 4098 if (prologue_need_br_taken_cost) 4099 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 4100 vect_prologue); 4101 4102 if (prologue_need_br_not_taken_cost) 4103 (void) add_stmt_cost (target_cost_data, 1, 4104 cond_branch_not_taken, vect_prologue); 4105 4106 if (epilogue_need_br_taken_cost) 4107 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 4108 vect_epilogue); 4109 4110 if (epilogue_need_br_not_taken_cost) 4111 (void) add_stmt_cost (target_cost_data, 1, 4112 cond_branch_not_taken, vect_epilogue); 4113 4114 /* Take care of special costs for rgroup controls of partial vectors. */ 4115 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 4116 { 4117 /* Calculate how many masks we need to generate. */ 4118 unsigned int num_masks = 0; 4119 rgroup_controls *rgm; 4120 unsigned int num_vectors_m1; 4121 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) 4122 if (rgm->type) 4123 num_masks += num_vectors_m1 + 1; 4124 gcc_assert (num_masks > 0); 4125 4126 /* In the worst case, we need to generate each mask in the prologue 4127 and in the loop body. One of the loop body mask instructions 4128 replaces the comparison in the scalar loop, and since we don't 4129 count the scalar comparison against the scalar body, we shouldn't 4130 count that vector instruction against the vector body either. 4131 4132 Sometimes we can use unpacks instead of generating prologue 4133 masks and sometimes the prologue mask will fold to a constant, 4134 so the actual prologue cost might be smaller. However, it's 4135 simpler and safer to use the worst-case cost; if this ends up 4136 being the tie-breaker between vectorizing or not, then it's 4137 probably better not to vectorize. */ 4138 (void) add_stmt_cost (target_cost_data, num_masks, 4139 vector_stmt, NULL, NULL, NULL_TREE, 0, 4140 vect_prologue); 4141 (void) add_stmt_cost (target_cost_data, num_masks - 1, 4142 vector_stmt, NULL, NULL, NULL_TREE, 0, 4143 vect_body); 4144 } 4145 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) 4146 { 4147 /* Referring to the functions vect_set_loop_condition_partial_vectors 4148 and vect_set_loop_controls_directly, we need to generate each 4149 length in the prologue and in the loop body if required. Although 4150 there are some possible optimizations, we consider the worst case 4151 here. 
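      For illustration only: with a single length-controlled rgroup the
      prologue has to compute the first length, e.g. something like

        len_0 = MIN (nitems_total, nitems_per_iter);

      and, if the loop iterates, the body has to track how many scalar
      items remain and recompute the length for the next iteration.  The
      counts below are worst-case tallies of those MIN/MINUS (and, with a
      nonzero load/store bias, PLUS) statements per rgroup; the exact
      sequences are produced by vect_set_loop_controls_directly.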
*/ 4152 4153 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo); 4154 signed char partial_load_store_bias 4155 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); 4156 bool need_iterate_p 4157 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) 4158 && !vect_known_niters_smaller_than_vf (loop_vinfo)); 4159 4160 /* Calculate how many statements to be added. */ 4161 unsigned int prologue_stmts = 0; 4162 unsigned int body_stmts = 0; 4163 4164 rgroup_controls *rgc; 4165 unsigned int num_vectors_m1; 4166 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) 4167 if (rgc->type) 4168 { 4169 /* May need one SHIFT for nitems_total computation. */ 4170 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; 4171 if (nitems != 1 && !niters_known_p) 4172 prologue_stmts += 1; 4173 4174 /* May need one MAX and one MINUS for wrap around. */ 4175 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc)) 4176 prologue_stmts += 2; 4177 4178 /* Need one MAX and one MINUS for each batch limit excepting for 4179 the 1st one. */ 4180 prologue_stmts += num_vectors_m1 * 2; 4181 4182 unsigned int num_vectors = num_vectors_m1 + 1; 4183 4184 /* Need to set up lengths in prologue, only one MIN required 4185 for each since start index is zero. */ 4186 prologue_stmts += num_vectors; 4187 4188 /* If we have a non-zero partial load bias, we need one PLUS 4189 to adjust the load length. */ 4190 if (partial_load_store_bias != 0) 4191 body_stmts += 1; 4192 4193 /* Each may need two MINs and one MINUS to update lengths in body 4194 for next iteration. */ 4195 if (need_iterate_p) 4196 body_stmts += 3 * num_vectors; 4197 } 4198 4199 (void) add_stmt_cost (target_cost_data, prologue_stmts, 4200 scalar_stmt, vect_prologue); 4201 (void) add_stmt_cost (target_cost_data, body_stmts, 4202 scalar_stmt, vect_body); 4203 } 4204 4205 /* FORNOW: The scalar outside cost is incremented in one of the 4206 following ways: 4207 4208 1. The vectorizer checks for alignment and aliasing and generates 4209 a condition that allows dynamic vectorization. A cost model 4210 check is ANDED with the versioning condition. Hence scalar code 4211 path now has the added cost of the versioning check. 4212 4213 if (cost > th & versioning_check) 4214 jmp to vector code 4215 4216 Hence run-time scalar is incremented by not-taken branch cost. 4217 4218 2. The vectorizer then checks if a prologue is required. If the 4219 cost model check was not done before during versioning, it has to 4220 be done before the prologue check. 4221 4222 if (cost <= th) 4223 prologue = scalar_iters 4224 if (prologue == 0) 4225 jmp to vector code 4226 else 4227 execute prologue 4228 if (prologue == num_iters) 4229 go to exit 4230 4231 Hence the run-time scalar cost is incremented by a taken branch, 4232 plus a not-taken branch, plus a taken branch cost. 4233 4234 3. The vectorizer then checks if an epilogue is required. If the 4235 cost model check was not done before during prologue check, it 4236 has to be done with the epilogue check. 4237 4238 if (prologue == 0) 4239 jmp to vector code 4240 else 4241 execute prologue 4242 if (prologue == num_iters) 4243 go to exit 4244 vector code: 4245 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) 4246 jmp to epilogue 4247 4248 Hence the run-time scalar cost should be incremented by 2 taken 4249 branches. 4250 4251 TODO: The back end may reorder the BBS's differently and reverse 4252 conditions/branch directions. Change the estimates below to 4253 something more reasonable. 
*/ 4254 4255 /* If the number of iterations is known and we do not do versioning, we can 4256 decide whether to vectorize at compile time. Hence the scalar version 4257 do not carry cost model guard costs. */ 4258 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 4259 || LOOP_REQUIRES_VERSIONING (loop_vinfo)) 4260 { 4261 /* Cost model check occurs at versioning. */ 4262 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 4263 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken); 4264 else 4265 { 4266 /* Cost model check occurs at prologue generation. */ 4267 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 4268 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken) 4269 + vect_get_stmt_cost (cond_branch_not_taken); 4270 /* Cost model check occurs at epilogue generation. */ 4271 else 4272 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken); 4273 } 4274 } 4275 4276 /* Complete the target-specific cost calculations. */ 4277 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs, 4278 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost, 4279 suggested_unroll_factor); 4280 4281 if (suggested_unroll_factor && *suggested_unroll_factor > 1 4282 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR 4283 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) * 4284 *suggested_unroll_factor, 4285 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo))) 4286 { 4287 if (dump_enabled_p ()) 4288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4289 "can't unroll as unrolled vectorization factor larger" 4290 " than maximum vectorization factor: " 4291 HOST_WIDE_INT_PRINT_UNSIGNED "\n", 4292 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); 4293 *suggested_unroll_factor = 1; 4294 } 4295 4296 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); 4297 4298 if (dump_enabled_p ()) 4299 { 4300 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); 4301 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", 4302 vec_inside_cost); 4303 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", 4304 vec_prologue_cost); 4305 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", 4306 vec_epilogue_cost); 4307 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n", 4308 scalar_single_iter_cost); 4309 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n", 4310 scalar_outside_cost); 4311 dump_printf (MSG_NOTE, " Vector outside cost: %d\n", 4312 vec_outside_cost); 4313 dump_printf (MSG_NOTE, " prologue iterations: %d\n", 4314 peel_iters_prologue); 4315 dump_printf (MSG_NOTE, " epilogue iterations: %d\n", 4316 peel_iters_epilogue); 4317 } 4318 4319 /* Calculate number of iterations required to make the vector version 4320 profitable, relative to the loop bodies only. The following condition 4321 must hold true: 4322 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC 4323 where 4324 SIC = scalar iteration cost, VIC = vector iteration cost, 4325 VOC = vector outside cost, VF = vectorization factor, 4326 NPEEL = prologue iterations + epilogue iterations, 4327 SOC = scalar outside cost for run time cost model check. 
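   As a made-up numeric example: with SIC = 4, VIC = 6, VF = 4, NPEEL = 0,
   VOC = 20 and SOC = 0 the condition becomes

     4 * niters > 6 * niters / 4 + 20

   i.e. niters > 8, so the runtime threshold computed below works out to 9
   iterations (and, when not using partial vectors, is further raised to at
   least VF plus the prologue peel iterations so that the vector loop runs
   at least once).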
*/ 4328 4329 int saving_per_viter = (scalar_single_iter_cost * assumed_vf 4330 - vec_inside_cost); 4331 if (saving_per_viter <= 0) 4332 { 4333 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) 4334 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, 4335 "vectorization did not happen for a simd loop"); 4336 4337 if (dump_enabled_p ()) 4338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4339 "cost model: the vector iteration cost = %d " 4340 "divided by the scalar iteration cost = %d " 4341 "is greater or equal to the vectorization factor = %d" 4342 ".\n", 4343 vec_inside_cost, scalar_single_iter_cost, assumed_vf); 4344 *ret_min_profitable_niters = -1; 4345 *ret_min_profitable_estimate = -1; 4346 return; 4347 } 4348 4349 /* ??? The "if" arm is written to handle all cases; see below for what 4350 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ 4351 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 4352 { 4353 /* Rewriting the condition above in terms of the number of 4354 vector iterations (vniters) rather than the number of 4355 scalar iterations (niters) gives: 4356 4357 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC 4358 4359 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC 4360 4361 For integer N, X and Y when X > 0: 4362 4363 N * X > Y <==> N >= (Y /[floor] X) + 1. */ 4364 int outside_overhead = (vec_outside_cost 4365 - scalar_single_iter_cost * peel_iters_prologue 4366 - scalar_single_iter_cost * peel_iters_epilogue 4367 - scalar_outside_cost); 4368 /* We're only interested in cases that require at least one 4369 vector iteration. */ 4370 int min_vec_niters = 1; 4371 if (outside_overhead > 0) 4372 min_vec_niters = outside_overhead / saving_per_viter + 1; 4373 4374 if (dump_enabled_p ()) 4375 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n", 4376 min_vec_niters); 4377 4378 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 4379 { 4380 /* Now that we know the minimum number of vector iterations, 4381 find the minimum niters for which the scalar cost is larger: 4382 4383 SIC * niters > VIC * vniters + VOC - SOC 4384 4385 We know that the minimum niters is no more than 4386 vniters * VF + NPEEL, but it might be (and often is) less 4387 than that if a partial vector iteration is cheaper than the 4388 equivalent scalar code. */ 4389 int threshold = (vec_inside_cost * min_vec_niters 4390 + vec_outside_cost 4391 - scalar_outside_cost); 4392 if (threshold <= 0) 4393 min_profitable_iters = 1; 4394 else 4395 min_profitable_iters = threshold / scalar_single_iter_cost + 1; 4396 } 4397 else 4398 /* Convert the number of vector iterations into a number of 4399 scalar iterations. 
*/ 4400 min_profitable_iters = (min_vec_niters * assumed_vf 4401 + peel_iters_prologue 4402 + peel_iters_epilogue); 4403 } 4404 else 4405 { 4406 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) 4407 * assumed_vf 4408 - vec_inside_cost * peel_iters_prologue 4409 - vec_inside_cost * peel_iters_epilogue); 4410 if (min_profitable_iters <= 0) 4411 min_profitable_iters = 0; 4412 else 4413 { 4414 min_profitable_iters /= saving_per_viter; 4415 4416 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) 4417 <= (((int) vec_inside_cost * min_profitable_iters) 4418 + (((int) vec_outside_cost - scalar_outside_cost) 4419 * assumed_vf))) 4420 min_profitable_iters++; 4421 } 4422 } 4423 4424 if (dump_enabled_p ()) 4425 dump_printf (MSG_NOTE, 4426 " Calculated minimum iters for profitability: %d\n", 4427 min_profitable_iters); 4428 4429 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) 4430 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) 4431 /* We want the vectorized loop to execute at least once. */ 4432 min_profitable_iters = assumed_vf + peel_iters_prologue; 4433 else if (min_profitable_iters < peel_iters_prologue) 4434 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the 4435 vectorized loop executes at least once. */ 4436 min_profitable_iters = peel_iters_prologue; 4437 4438 if (dump_enabled_p ()) 4439 dump_printf_loc (MSG_NOTE, vect_location, 4440 " Runtime profitability threshold = %d\n", 4441 min_profitable_iters); 4442 4443 *ret_min_profitable_niters = min_profitable_iters; 4444 4445 /* Calculate number of iterations required to make the vector version 4446 profitable, relative to the loop bodies only. 4447 4448 Non-vectorized variant is SIC * niters and it must win over vector 4449 variant on the expected loop trip count. The following condition must hold true: 4450 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */ 4451 4452 if (vec_outside_cost <= 0) 4453 min_profitable_estimate = 0; 4454 /* ??? This "else if" arm is written to handle all cases; see below for 4455 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ 4456 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 4457 { 4458 /* This is a repeat of the code above, but with + SOC rather 4459 than - SOC. 
*/ 4460 int outside_overhead = (vec_outside_cost 4461 - scalar_single_iter_cost * peel_iters_prologue 4462 - scalar_single_iter_cost * peel_iters_epilogue 4463 + scalar_outside_cost); 4464 int min_vec_niters = 1; 4465 if (outside_overhead > 0) 4466 min_vec_niters = outside_overhead / saving_per_viter + 1; 4467 4468 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) 4469 { 4470 int threshold = (vec_inside_cost * min_vec_niters 4471 + vec_outside_cost 4472 + scalar_outside_cost); 4473 min_profitable_estimate = threshold / scalar_single_iter_cost + 1; 4474 } 4475 else 4476 min_profitable_estimate = (min_vec_niters * assumed_vf 4477 + peel_iters_prologue 4478 + peel_iters_epilogue); 4479 } 4480 else 4481 { 4482 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) 4483 * assumed_vf 4484 - vec_inside_cost * peel_iters_prologue 4485 - vec_inside_cost * peel_iters_epilogue) 4486 / ((scalar_single_iter_cost * assumed_vf) 4487 - vec_inside_cost); 4488 } 4489 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); 4490 if (dump_enabled_p ()) 4491 dump_printf_loc (MSG_NOTE, vect_location, 4492 " Static estimate profitability threshold = %d\n", 4493 min_profitable_estimate); 4494 4495 *ret_min_profitable_estimate = min_profitable_estimate; 4496 } 4497 4498 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET 4499 vector elements (not bits) for a vector with NELT elements. */ 4500 static void 4501 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, 4502 vec_perm_builder *sel) 4503 { 4504 /* The encoding is a single stepped pattern. Any wrap-around is handled 4505 by vec_perm_indices. */ 4506 sel->new_vector (nelt, 1, 3); 4507 for (unsigned int i = 0; i < 3; i++) 4508 sel->quick_push (i + offset); 4509 } 4510 4511 /* Checks whether the target supports whole-vector shifts for vectors of mode 4512 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ 4513 it supports vec_perm_const with masks for all necessary shift amounts. */ 4514 static bool 4515 have_whole_vector_shift (machine_mode mode) 4516 { 4517 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) 4518 return true; 4519 4520 /* Variable-length vectors should be handled via the optab. */ 4521 unsigned int nelt; 4522 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 4523 return false; 4524 4525 vec_perm_builder sel; 4526 vec_perm_indices indices; 4527 for (unsigned int i = nelt / 2; i >= 1; i /= 2) 4528 { 4529 calc_vec_perm_mask_for_shift (i, nelt, &sel); 4530 indices.new_vector (sel, 2, nelt); 4531 if (!can_vec_perm_const_p (mode, indices, false)) 4532 return false; 4533 } 4534 return true; 4535 } 4536 4537 /* TODO: Close dependency between vect_model_*_cost and vectorizable_* 4538 functions. Design better to avoid maintenance issues. */ 4539 4540 /* Function vect_model_reduction_cost. 4541 4542 Models cost for a reduction operation, including the vector ops 4543 generated within the strip-mine loop in some cases, the initial 4544 definition before the loop, and the epilogue code that must be generated. 
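   For example (an illustrative sketch assuming a 4-element vector and a
   PLUS_EXPR reduction): if the target provides IFN_REDUC_PLUS the epilogue
   is a single reduction operation plus a scalar extract; otherwise, if
   whole-vector shifts are available, it is log2(4) = 2 shift-and-add steps
   followed by an extract, roughly

     v = v + (v shifted down by 2 elements);
     v = v + (v shifted down by 1 element);
     s = v[0];

   and failing that, 4 element extracts combined by 3 scalar additions.
   The costing below mirrors these three strategies.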
*/ 4545 4546 static void 4547 vect_model_reduction_cost (loop_vec_info loop_vinfo, 4548 stmt_vec_info stmt_info, internal_fn reduc_fn, 4549 vect_reduction_type reduction_type, 4550 int ncopies, stmt_vector_for_cost *cost_vec) 4551 { 4552 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0; 4553 tree vectype; 4554 machine_mode mode; 4555 class loop *loop = NULL; 4556 4557 if (loop_vinfo) 4558 loop = LOOP_VINFO_LOOP (loop_vinfo); 4559 4560 /* Condition reductions generate two reductions in the loop. */ 4561 if (reduction_type == COND_REDUCTION) 4562 ncopies *= 2; 4563 4564 vectype = STMT_VINFO_VECTYPE (stmt_info); 4565 mode = TYPE_MODE (vectype); 4566 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); 4567 4568 gimple_match_op op; 4569 if (!gimple_extract_op (orig_stmt_info->stmt, &op)) 4570 gcc_unreachable (); 4571 4572 if (reduction_type == EXTRACT_LAST_REDUCTION) 4573 /* No extra instructions are needed in the prologue. The loop body 4574 operations are costed in vectorizable_condition. */ 4575 inside_cost = 0; 4576 else if (reduction_type == FOLD_LEFT_REDUCTION) 4577 { 4578 /* No extra instructions needed in the prologue. */ 4579 prologue_cost = 0; 4580 4581 if (reduc_fn != IFN_LAST) 4582 /* Count one reduction-like operation per vector. */ 4583 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar, 4584 stmt_info, 0, vect_body); 4585 else 4586 { 4587 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */ 4588 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype); 4589 inside_cost = record_stmt_cost (cost_vec, nelements, 4590 vec_to_scalar, stmt_info, 0, 4591 vect_body); 4592 inside_cost += record_stmt_cost (cost_vec, nelements, 4593 scalar_stmt, stmt_info, 0, 4594 vect_body); 4595 } 4596 } 4597 else 4598 { 4599 /* Add in cost for initial definition. 4600 For cond reduction we have four vectors: initial index, step, 4601 initial result of the data reduction, initial value of the index 4602 reduction. */ 4603 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1; 4604 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, 4605 scalar_to_vec, stmt_info, 0, 4606 vect_prologue); 4607 } 4608 4609 /* Determine cost of epilogue code. 4610 4611 We have a reduction operator that will reduce the vector in one statement. 4612 Also requires scalar extract. */ 4613 4614 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info)) 4615 { 4616 if (reduc_fn != IFN_LAST) 4617 { 4618 if (reduction_type == COND_REDUCTION) 4619 { 4620 /* An EQ stmt and an COND_EXPR stmt. */ 4621 epilogue_cost += record_stmt_cost (cost_vec, 2, 4622 vector_stmt, stmt_info, 0, 4623 vect_epilogue); 4624 /* Reduction of the max index and a reduction of the found 4625 values. */ 4626 epilogue_cost += record_stmt_cost (cost_vec, 2, 4627 vec_to_scalar, stmt_info, 0, 4628 vect_epilogue); 4629 /* A broadcast of the max value. */ 4630 epilogue_cost += record_stmt_cost (cost_vec, 1, 4631 scalar_to_vec, stmt_info, 0, 4632 vect_epilogue); 4633 } 4634 else 4635 { 4636 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt, 4637 stmt_info, 0, vect_epilogue); 4638 epilogue_cost += record_stmt_cost (cost_vec, 1, 4639 vec_to_scalar, stmt_info, 0, 4640 vect_epilogue); 4641 } 4642 } 4643 else if (reduction_type == COND_REDUCTION) 4644 { 4645 unsigned estimated_nunits = vect_nunits_for_cost (vectype); 4646 /* Extraction of scalar elements. 
*/ 4647 epilogue_cost += record_stmt_cost (cost_vec, 4648 2 * estimated_nunits, 4649 vec_to_scalar, stmt_info, 0, 4650 vect_epilogue); 4651 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */ 4652 epilogue_cost += record_stmt_cost (cost_vec, 4653 2 * estimated_nunits - 3, 4654 scalar_stmt, stmt_info, 0, 4655 vect_epilogue); 4656 } 4657 else if (reduction_type == EXTRACT_LAST_REDUCTION 4658 || reduction_type == FOLD_LEFT_REDUCTION) 4659 /* No extra instructions need in the epilogue. */ 4660 ; 4661 else 4662 { 4663 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 4664 tree bitsize = TYPE_SIZE (op.type); 4665 int element_bitsize = tree_to_uhwi (bitsize); 4666 int nelements = vec_size_in_bits / element_bitsize; 4667 4668 if (op.code == COND_EXPR) 4669 op.code = MAX_EXPR; 4670 4671 /* We have a whole vector shift available. */ 4672 if (VECTOR_MODE_P (mode) 4673 && directly_supported_p (op.code, vectype) 4674 && have_whole_vector_shift (mode)) 4675 { 4676 /* Final reduction via vector shifts and the reduction operator. 4677 Also requires scalar extract. */ 4678 epilogue_cost += record_stmt_cost (cost_vec, 4679 exact_log2 (nelements) * 2, 4680 vector_stmt, stmt_info, 0, 4681 vect_epilogue); 4682 epilogue_cost += record_stmt_cost (cost_vec, 1, 4683 vec_to_scalar, stmt_info, 0, 4684 vect_epilogue); 4685 } 4686 else 4687 /* Use extracts and reduction op for final reduction. For N 4688 elements, we have N extracts and N-1 reduction ops. */ 4689 epilogue_cost += record_stmt_cost (cost_vec, 4690 nelements + nelements - 1, 4691 vector_stmt, stmt_info, 0, 4692 vect_epilogue); 4693 } 4694 } 4695 4696 if (dump_enabled_p ()) 4697 dump_printf (MSG_NOTE, 4698 "vect_model_reduction_cost: inside_cost = %d, " 4699 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost, 4700 prologue_cost, epilogue_cost); 4701 } 4702 4703 /* SEQ is a sequence of instructions that initialize the reduction 4704 described by REDUC_INFO. Emit them in the appropriate place. */ 4705 4706 static void 4707 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo, 4708 stmt_vec_info reduc_info, gimple *seq) 4709 { 4710 if (reduc_info->reused_accumulator) 4711 { 4712 /* When reusing an accumulator from the main loop, we only need 4713 initialization instructions if the main loop can be skipped. 4714 In that case, emit the initialization instructions at the end 4715 of the guard block that does the skip. */ 4716 edge skip_edge = loop_vinfo->skip_main_loop_edge; 4717 gcc_assert (skip_edge); 4718 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src); 4719 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); 4720 } 4721 else 4722 { 4723 /* The normal case: emit the initialization instructions on the 4724 preheader edge. */ 4725 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 4726 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq); 4727 } 4728 } 4729 4730 /* Function get_initial_def_for_reduction 4731 4732 Input: 4733 REDUC_INFO - the info_for_reduction 4734 INIT_VAL - the initial value of the reduction variable 4735 NEUTRAL_OP - a value that has no effect on the reduction, as per 4736 neutral_op_for_reduction 4737 4738 Output: 4739 Return a vector variable, initialized according to the operation that 4740 STMT_VINFO performs. This vector will be used as the initial value 4741 of the vector of partial results. 4742 4743 The value we need is a vector in which element 0 has value INIT_VAL 4744 and every other element has value NEUTRAL_OP. 
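   For example (with made-up values): for a sum reduction of ints with
   INIT_VAL 5 and a 4-element vector type the result is {5, 0, 0, 0},
   whereas for a MIN or MAX reduction NEUTRAL_OP equals INIT_VAL and the
   result degenerates to the splat {5, 5, 5, 5}, which is the first case
   handled below.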
*/ 4745 4746 static tree 4747 get_initial_def_for_reduction (loop_vec_info loop_vinfo, 4748 stmt_vec_info reduc_info, 4749 tree init_val, tree neutral_op) 4750 { 4751 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 4752 tree scalar_type = TREE_TYPE (init_val); 4753 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); 4754 tree init_def; 4755 gimple_seq stmts = NULL; 4756 4757 gcc_assert (vectype); 4758 4759 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) 4760 || SCALAR_FLOAT_TYPE_P (scalar_type)); 4761 4762 gcc_assert (nested_in_vect_loop_p (loop, reduc_info) 4763 || loop == (gimple_bb (reduc_info->stmt))->loop_father); 4764 4765 if (operand_equal_p (init_val, neutral_op)) 4766 { 4767 /* If both elements are equal then the vector described above is 4768 just a splat. */ 4769 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); 4770 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op); 4771 } 4772 else 4773 { 4774 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); 4775 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); 4776 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) 4777 { 4778 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into 4779 element 0. */ 4780 init_def = gimple_build_vector_from_val (&stmts, vectype, 4781 neutral_op); 4782 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, 4783 vectype, init_def, init_val); 4784 } 4785 else 4786 { 4787 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */ 4788 tree_vector_builder elts (vectype, 1, 2); 4789 elts.quick_push (init_val); 4790 elts.quick_push (neutral_op); 4791 init_def = gimple_build_vector (&stmts, &elts); 4792 } 4793 } 4794 4795 if (stmts) 4796 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts); 4797 return init_def; 4798 } 4799 4800 /* Get at the initial defs for the reduction PHIs for REDUC_INFO, 4801 which performs a reduction involving GROUP_SIZE scalar statements. 4802 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP 4803 is nonnull, introducing extra elements of that value will not change the 4804 result. */ 4805 4806 static void 4807 get_initial_defs_for_reduction (loop_vec_info loop_vinfo, 4808 stmt_vec_info reduc_info, 4809 vec<tree> *vec_oprnds, 4810 unsigned int number_of_vectors, 4811 unsigned int group_size, tree neutral_op) 4812 { 4813 vec<tree> &initial_values = reduc_info->reduc_initial_values; 4814 unsigned HOST_WIDE_INT nunits; 4815 unsigned j, number_of_places_left_in_vector; 4816 tree vector_type = STMT_VINFO_VECTYPE (reduc_info); 4817 unsigned int i; 4818 4819 gcc_assert (group_size == initial_values.length () || neutral_op); 4820 4821 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 4822 created vectors. It is greater than 1 if unrolling is performed. 4823 4824 For example, we have two scalar operands, s1 and s2 (e.g., group of 4825 strided accesses of size two), while NUNITS is four (i.e., four scalars 4826 of this type can be packed in a vector). The output vector will contain 4827 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 4828 will be 2). 4829 4830 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several 4831 vectors containing the operands. 4832 4833 For example, NUNITS is four as before, and the group size is 8 4834 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 4835 {s5, s6, s7, s8}. 
*/ 4836 4837 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) 4838 nunits = group_size; 4839 4840 number_of_places_left_in_vector = nunits; 4841 bool constant_p = true; 4842 tree_vector_builder elts (vector_type, nunits, 1); 4843 elts.quick_grow (nunits); 4844 gimple_seq ctor_seq = NULL; 4845 for (j = 0; j < nunits * number_of_vectors; ++j) 4846 { 4847 tree op; 4848 i = j % group_size; 4849 4850 /* Get the def before the loop. In reduction chain we have only 4851 one initial value. Else we have as many as PHIs in the group. */ 4852 if (i >= initial_values.length () || (j > i && neutral_op)) 4853 op = neutral_op; 4854 else 4855 op = initial_values[i]; 4856 4857 /* Create 'vect_ = {op0,op1,...,opn}'. */ 4858 number_of_places_left_in_vector--; 4859 elts[nunits - number_of_places_left_in_vector - 1] = op; 4860 if (!CONSTANT_CLASS_P (op)) 4861 constant_p = false; 4862 4863 if (number_of_places_left_in_vector == 0) 4864 { 4865 tree init; 4866 if (constant_p && !neutral_op 4867 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) 4868 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) 4869 /* Build the vector directly from ELTS. */ 4870 init = gimple_build_vector (&ctor_seq, &elts); 4871 else if (neutral_op) 4872 { 4873 /* Build a vector of the neutral value and shift the 4874 other elements into place. */ 4875 init = gimple_build_vector_from_val (&ctor_seq, vector_type, 4876 neutral_op); 4877 int k = nunits; 4878 while (k > 0 && elts[k - 1] == neutral_op) 4879 k -= 1; 4880 while (k > 0) 4881 { 4882 k -= 1; 4883 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT, 4884 vector_type, init, elts[k]); 4885 } 4886 } 4887 else 4888 { 4889 /* First time round, duplicate ELTS to fill the 4890 required number of vectors. */ 4891 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type, 4892 elts, number_of_vectors, *vec_oprnds); 4893 break; 4894 } 4895 vec_oprnds->quick_push (init); 4896 4897 number_of_places_left_in_vector = nunits; 4898 elts.new_vector (vector_type, nunits, 1); 4899 elts.quick_grow (nunits); 4900 constant_p = true; 4901 } 4902 } 4903 if (ctor_seq != NULL) 4904 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq); 4905 } 4906 4907 /* For a statement STMT_INFO taking part in a reduction operation return 4908 the stmt_vec_info the meta information is stored on. */ 4909 4910 stmt_vec_info 4911 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info) 4912 { 4913 stmt_info = vect_orig_stmt (stmt_info); 4914 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info)); 4915 if (!is_a <gphi *> (stmt_info->stmt) 4916 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 4917 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); 4918 gphi *phi = as_a <gphi *> (stmt_info->stmt); 4919 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 4920 { 4921 if (gimple_phi_num_args (phi) == 1) 4922 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); 4923 } 4924 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 4925 { 4926 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi)); 4927 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) 4928 stmt_info = info; 4929 } 4930 return stmt_info; 4931 } 4932 4933 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that 4934 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise 4935 return false. 
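   For example (with made-up vector types): if the main loop accumulated a
   sum into a V8SI vector and the epilogue loop operates on V4SI, the V8SI
   accumulator can be reused by splitting it into two V4SI halves and
   adding them, letting the epilogue continue from that partial result
   instead of restarting from the scalar initial value.  The checks below
   verify that every such halving step has a supported intermediate vector
   type, reduction operation and extraction.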
*/ 4936 4937 static bool 4938 vect_find_reusable_accumulator (loop_vec_info loop_vinfo, 4939 stmt_vec_info reduc_info) 4940 { 4941 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); 4942 if (!main_loop_vinfo) 4943 return false; 4944 4945 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION) 4946 return false; 4947 4948 unsigned int num_phis = reduc_info->reduc_initial_values.length (); 4949 auto_vec<tree, 16> main_loop_results (num_phis); 4950 auto_vec<tree, 16> initial_values (num_phis); 4951 if (edge main_loop_edge = loop_vinfo->main_loop_edge) 4952 { 4953 /* The epilogue loop can be entered either from the main loop or 4954 from an earlier guard block. */ 4955 edge skip_edge = loop_vinfo->skip_main_loop_edge; 4956 for (tree incoming_value : reduc_info->reduc_initial_values) 4957 { 4958 /* Look for: 4959 4960 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop), 4961 INITIAL_VALUE(guard block)>. */ 4962 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME); 4963 4964 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value)); 4965 gcc_assert (gimple_bb (phi) == main_loop_edge->dest); 4966 4967 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge); 4968 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge); 4969 4970 main_loop_results.quick_push (from_main_loop); 4971 initial_values.quick_push (from_skip); 4972 } 4973 } 4974 else 4975 /* The main loop dominates the epilogue loop. */ 4976 main_loop_results.splice (reduc_info->reduc_initial_values); 4977 4978 /* See if the main loop has the kind of accumulator we need. */ 4979 vect_reusable_accumulator *accumulator 4980 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]); 4981 if (!accumulator 4982 || num_phis != accumulator->reduc_info->reduc_scalar_results.length () 4983 || !std::equal (main_loop_results.begin (), main_loop_results.end (), 4984 accumulator->reduc_info->reduc_scalar_results.begin ())) 4985 return false; 4986 4987 /* Handle the case where we can reduce wider vectors to narrower ones. */ 4988 tree vectype = STMT_VINFO_VECTYPE (reduc_info); 4989 tree old_vectype = TREE_TYPE (accumulator->reduc_input); 4990 unsigned HOST_WIDE_INT m; 4991 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype), 4992 TYPE_VECTOR_SUBPARTS (vectype), &m)) 4993 return false; 4994 /* Check the intermediate vector types and operations are available. */ 4995 tree prev_vectype = old_vectype; 4996 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype); 4997 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype))) 4998 { 4999 intermediate_nunits = exact_div (intermediate_nunits, 2); 5000 tree intermediate_vectype = get_related_vectype_for_scalar_type 5001 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits); 5002 if (!intermediate_vectype 5003 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info), 5004 intermediate_vectype) 5005 || !can_vec_extract (TYPE_MODE (prev_vectype), 5006 TYPE_MODE (intermediate_vectype))) 5007 return false; 5008 prev_vectype = intermediate_vectype; 5009 } 5010 5011 /* Non-SLP reductions might apply an adjustment after the reduction 5012 operation, in order to simplify the initialization of the accumulator. 5013 If the epilogue loop carries on from where the main loop left off, 5014 it should apply the same adjustment to the final reduction result. 
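   (For example, for a scalar loop of the form "sum = 10; loop: sum += a[i];"
   the main loop may start its vector accumulator from zeros and record the
   10 as STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT, to be added back only after
   the final reduction; an epilogue loop that continues from that
   accumulator must apply the same deferred +10 to its own final result.)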
5015 5016 If the epilogue loop can also be entered directly (rather than via 5017 the main loop), we need to be able to handle that case in the same way, 5018 with the same adjustment. (In principle we could add a PHI node 5019 to select the correct adjustment, but in practice that shouldn't be 5020 necessary.) */ 5021 tree main_adjustment 5022 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info); 5023 if (loop_vinfo->main_loop_edge && main_adjustment) 5024 { 5025 gcc_assert (num_phis == 1); 5026 tree initial_value = initial_values[0]; 5027 /* Check that we can use INITIAL_VALUE as the adjustment and 5028 initialize the accumulator with a neutral value instead. */ 5029 if (!operand_equal_p (initial_value, main_adjustment)) 5030 return false; 5031 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); 5032 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value), 5033 code, initial_value); 5034 } 5035 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment; 5036 reduc_info->reduc_initial_values.truncate (0); 5037 reduc_info->reduc_initial_values.splice (initial_values); 5038 reduc_info->reused_accumulator = accumulator; 5039 return true; 5040 } 5041 5042 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation 5043 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */ 5044 5045 static tree 5046 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code, 5047 gimple_seq *seq) 5048 { 5049 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant (); 5050 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5051 tree stype = TREE_TYPE (vectype); 5052 tree new_temp = vec_def; 5053 while (nunits > nunits1) 5054 { 5055 nunits /= 2; 5056 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), 5057 stype, nunits); 5058 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); 5059 5060 /* The target has to make sure we support lowpart/highpart 5061 extraction, either via direct vector extract or through 5062 an integer mode punning. */ 5063 tree dst1, dst2; 5064 gimple *epilog_stmt; 5065 if (convert_optab_handler (vec_extract_optab, 5066 TYPE_MODE (TREE_TYPE (new_temp)), 5067 TYPE_MODE (vectype1)) 5068 != CODE_FOR_nothing) 5069 { 5070 /* Extract sub-vectors directly once vec_extract becomes 5071 a conversion optab. */ 5072 dst1 = make_ssa_name (vectype1); 5073 epilog_stmt 5074 = gimple_build_assign (dst1, BIT_FIELD_REF, 5075 build3 (BIT_FIELD_REF, vectype1, 5076 new_temp, TYPE_SIZE (vectype1), 5077 bitsize_int (0))); 5078 gimple_seq_add_stmt_without_update (seq, epilog_stmt); 5079 dst2 = make_ssa_name (vectype1); 5080 epilog_stmt 5081 = gimple_build_assign (dst2, BIT_FIELD_REF, 5082 build3 (BIT_FIELD_REF, vectype1, 5083 new_temp, TYPE_SIZE (vectype1), 5084 bitsize_int (bitsize))); 5085 gimple_seq_add_stmt_without_update (seq, epilog_stmt); 5086 } 5087 else 5088 { 5089 /* Extract via punning to appropriately sized integer mode 5090 vector. 
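
   For illustration (the modes are hypothetical): to split a V4SI value
   into two V2SI halves on a target without a direct V4SI->V2SI
   vec_extract, the code below view-converts the V4SI value to V2DI,
   extracts each DI element with a BIT_FIELD_REF, and view-converts
   those scalars back to V2SI.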
*/ 5091 tree eltype = build_nonstandard_integer_type (bitsize, 1); 5092 tree etype = build_vector_type (eltype, 2); 5093 gcc_assert (convert_optab_handler (vec_extract_optab, 5094 TYPE_MODE (etype), 5095 TYPE_MODE (eltype)) 5096 != CODE_FOR_nothing); 5097 tree tem = make_ssa_name (etype); 5098 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, 5099 build1 (VIEW_CONVERT_EXPR, 5100 etype, new_temp)); 5101 gimple_seq_add_stmt_without_update (seq, epilog_stmt); 5102 new_temp = tem; 5103 tem = make_ssa_name (eltype); 5104 epilog_stmt 5105 = gimple_build_assign (tem, BIT_FIELD_REF, 5106 build3 (BIT_FIELD_REF, eltype, 5107 new_temp, TYPE_SIZE (eltype), 5108 bitsize_int (0))); 5109 gimple_seq_add_stmt_without_update (seq, epilog_stmt); 5110 dst1 = make_ssa_name (vectype1); 5111 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, 5112 build1 (VIEW_CONVERT_EXPR, 5113 vectype1, tem)); 5114 gimple_seq_add_stmt_without_update (seq, epilog_stmt); 5115 tem = make_ssa_name (eltype); 5116 epilog_stmt 5117 = gimple_build_assign (tem, BIT_FIELD_REF, 5118 build3 (BIT_FIELD_REF, eltype, 5119 new_temp, TYPE_SIZE (eltype), 5120 bitsize_int (bitsize))); 5121 gimple_seq_add_stmt_without_update (seq, epilog_stmt); 5122 dst2 = make_ssa_name (vectype1); 5123 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, 5124 build1 (VIEW_CONVERT_EXPR, 5125 vectype1, tem)); 5126 gimple_seq_add_stmt_without_update (seq, epilog_stmt); 5127 } 5128 5129 new_temp = gimple_build (seq, code, vectype1, dst1, dst2); 5130 } 5131 5132 return new_temp; 5133 } 5134 5135 /* Function vect_create_epilog_for_reduction 5136 5137 Create code at the loop-epilog to finalize the result of a reduction 5138 computation. 5139 5140 STMT_INFO is the scalar reduction stmt that is being vectorized. 5141 SLP_NODE is an SLP node containing a group of reduction statements. The 5142 first one in this group is STMT_INFO. 5143 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE 5144 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi 5145 (counting from 0) 5146 5147 This function: 5148 1. Completes the reduction def-use cycles. 5149 2. "Reduces" each vector of partial results VECT_DEFS into a single result, 5150 by calling the function specified by REDUC_FN if available, or by 5151 other means (whole-vector shifts or a scalar loop). 5152 The function also creates a new phi node at the loop exit to preserve 5153 loop-closed form, as illustrated below. 
5154 5155 The flow at the entry to this function: 5156 5157 loop: 5158 vec_def = phi <vec_init, null> # REDUCTION_PHI 5159 VECT_DEF = vector_stmt # vectorized form of STMT_INFO 5160 s_loop = scalar_stmt # (scalar) STMT_INFO 5161 loop_exit: 5162 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5163 use <s_out0> 5164 use <s_out0> 5165 5166 The above is transformed by this function into: 5167 5168 loop: 5169 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 5170 VECT_DEF = vector_stmt # vectorized form of STMT_INFO 5171 s_loop = scalar_stmt # (scalar) STMT_INFO 5172 loop_exit: 5173 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5174 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5175 v_out2 = reduce <v_out1> 5176 s_out3 = extract_field <v_out2, 0> 5177 s_out4 = adjust_result <s_out3> 5178 use <s_out4> 5179 use <s_out4> 5180 */ 5181 5182 static void 5183 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, 5184 stmt_vec_info stmt_info, 5185 slp_tree slp_node, 5186 slp_instance slp_node_instance) 5187 { 5188 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); 5189 gcc_assert (reduc_info->is_reduc_info); 5190 /* For double reductions we need to get at the inner loop reduction 5191 stmt which has the meta info attached. Our stmt_info is that of the 5192 loop-closed PHI of the inner loop which we remember as 5193 def for the reduction PHI generation. */ 5194 bool double_reduc = false; 5195 stmt_vec_info rdef_info = stmt_info; 5196 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 5197 { 5198 gcc_assert (!slp_node); 5199 double_reduc = true; 5200 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def 5201 (stmt_info->stmt, 0)); 5202 stmt_info = vect_stmt_to_vectorize (stmt_info); 5203 } 5204 gphi *reduc_def_stmt 5205 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt); 5206 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); 5207 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); 5208 tree vectype; 5209 machine_mode mode; 5210 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; 5211 basic_block exit_bb; 5212 tree scalar_dest; 5213 tree scalar_type; 5214 gimple *new_phi = NULL, *phi; 5215 gimple_stmt_iterator exit_gsi; 5216 tree new_temp = NULL_TREE, new_name, new_scalar_dest; 5217 gimple *epilog_stmt = NULL; 5218 gimple *exit_phi; 5219 tree bitsize; 5220 tree def; 5221 tree orig_name, scalar_result; 5222 imm_use_iterator imm_iter, phi_imm_iter; 5223 use_operand_p use_p, phi_use_p; 5224 gimple *use_stmt; 5225 auto_vec<tree> reduc_inputs; 5226 int j, i; 5227 vec<tree> &scalar_results = reduc_info->reduc_scalar_results; 5228 unsigned int group_size = 1, k; 5229 auto_vec<gimple *> phis; 5230 /* SLP reduction without reduction chain, e.g., 5231 # a1 = phi <a2, a0> 5232 # b1 = phi <b2, b0> 5233 a2 = operation (a1) 5234 b2 = operation (b1) */ 5235 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); 5236 bool direct_slp_reduc; 5237 tree induction_index = NULL_TREE; 5238 5239 if (slp_node) 5240 group_size = SLP_TREE_LANES (slp_node); 5241 5242 if (nested_in_vect_loop_p (loop, stmt_info)) 5243 { 5244 outer_loop = loop; 5245 loop = loop->inner; 5246 gcc_assert (!slp_node && double_reduc); 5247 } 5248 5249 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); 5250 gcc_assert (vectype); 5251 mode = TYPE_MODE (vectype); 5252 5253 tree induc_val = NULL_TREE; 5254 tree adjustment_def = NULL; 5255 if (slp_node) 5256 ; 5257 else 5258 { 5259 /* Optimize: for induction condition reduction, if we can't use zero 5260 for induc_val, use 
initial_def. */ 5261 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) 5262 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); 5263 else if (double_reduc) 5264 ; 5265 else 5266 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); 5267 } 5268 5269 stmt_vec_info single_live_out_stmt[] = { stmt_info }; 5270 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt; 5271 if (slp_reduc) 5272 /* All statements produce live-out values. */ 5273 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node); 5274 else if (slp_node) 5275 { 5276 /* The last statement in the reduction chain produces the live-out 5277 value. Note SLP optimization can shuffle scalar stmts to 5278 optimize permutations so we have to search for the last stmt. */ 5279 for (k = 0; k < group_size; ++k) 5280 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k])) 5281 { 5282 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k]; 5283 break; 5284 } 5285 } 5286 5287 unsigned vec_num; 5288 int ncopies; 5289 if (slp_node) 5290 { 5291 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length (); 5292 ncopies = 1; 5293 } 5294 else 5295 { 5296 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt); 5297 vec_num = 1; 5298 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length (); 5299 } 5300 5301 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) 5302 which is updated with the current index of the loop for every match of 5303 the original loop's cond_expr (VEC_STMT). This results in a vector 5304 containing the last time the condition passed for that vector lane. 5305 The first match will be a 1 to allow 0 to be used for non-matching 5306 indexes. If there are no matches at all then the vector will be all 5307 zeroes. 5308 5309 PR92772: This algorithm is broken for architectures that support 5310 masked vectors, but do not provide fold_extract_last. */ 5311 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) 5312 { 5313 auto_vec<std::pair<tree, bool>, 2> ccompares; 5314 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); 5315 cond_info = vect_stmt_to_vectorize (cond_info); 5316 while (cond_info != reduc_info) 5317 { 5318 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) 5319 { 5320 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0]; 5321 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); 5322 ccompares.safe_push 5323 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)), 5324 STMT_VINFO_REDUC_IDX (cond_info) == 2)); 5325 } 5326 cond_info 5327 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, 5328 1 + STMT_VINFO_REDUC_IDX 5329 (cond_info))); 5330 cond_info = vect_stmt_to_vectorize (cond_info); 5331 } 5332 gcc_assert (ccompares.length () != 0); 5333 5334 tree indx_before_incr, indx_after_incr; 5335 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); 5336 int scalar_precision 5337 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); 5338 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); 5339 tree cr_index_vector_type = get_related_vectype_for_scalar_type 5340 (TYPE_MODE (vectype), cr_index_scalar_type, 5341 TYPE_VECTOR_SUBPARTS (vectype)); 5342 5343 /* First we create a simple vector induction variable which starts 5344 with the values {1,2,3,...} (SERIES_VECT) and increments by the 5345 vector size (STEP). */ 5346 5347 /* Create a {1,2,3,...} vector. 
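
   E.g. with a hypothetical 4-lane index vector SERIES_VECT is
   {1, 2, 3, 4} and the step vector built below is {4, 4, 4, 4}, so in
   iteration k lane j of the IV holds the 1-based scalar iteration
   number 4*k + j + 1.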
*/ 5348 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); 5349 5350 /* Create a vector of the step value. */ 5351 tree step = build_int_cst (cr_index_scalar_type, nunits_out); 5352 tree vec_step = build_vector_from_val (cr_index_vector_type, step); 5353 5354 /* Create an induction variable. */ 5355 gimple_stmt_iterator incr_gsi; 5356 bool insert_after; 5357 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 5358 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, 5359 insert_after, &indx_before_incr, &indx_after_incr); 5360 5361 /* Next create a new phi node vector (NEW_PHI_TREE) which starts 5362 filled with zeros (VEC_ZERO). */ 5363 5364 /* Create a vector of 0s. */ 5365 tree zero = build_zero_cst (cr_index_scalar_type); 5366 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); 5367 5368 /* Create a vector phi node. */ 5369 tree new_phi_tree = make_ssa_name (cr_index_vector_type); 5370 new_phi = create_phi_node (new_phi_tree, loop->header); 5371 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, 5372 loop_preheader_edge (loop), UNKNOWN_LOCATION); 5373 5374 /* Now take the condition from the loops original cond_exprs 5375 and produce a new cond_exprs (INDEX_COND_EXPR) which for 5376 every match uses values from the induction variable 5377 (INDEX_BEFORE_INCR) otherwise uses values from the phi node 5378 (NEW_PHI_TREE). 5379 Finally, we update the phi (NEW_PHI_TREE) to take the value of 5380 the new cond_expr (INDEX_COND_EXPR). */ 5381 gimple_seq stmts = NULL; 5382 for (int i = ccompares.length () - 1; i != -1; --i) 5383 { 5384 tree ccompare = ccompares[i].first; 5385 if (ccompares[i].second) 5386 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, 5387 cr_index_vector_type, 5388 ccompare, 5389 indx_before_incr, new_phi_tree); 5390 else 5391 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, 5392 cr_index_vector_type, 5393 ccompare, 5394 new_phi_tree, indx_before_incr); 5395 } 5396 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT); 5397 5398 /* Update the phi with the vec cond. */ 5399 induction_index = new_phi_tree; 5400 add_phi_arg (as_a <gphi *> (new_phi), induction_index, 5401 loop_latch_edge (loop), UNKNOWN_LOCATION); 5402 } 5403 5404 /* 2. Create epilog code. 5405 The reduction epilog code operates across the elements of the vector 5406 of partial results computed by the vectorized loop. 5407 The reduction epilog code consists of: 5408 5409 step 1: compute the scalar result in a vector (v_out2) 5410 step 2: extract the scalar result (s_out3) from the vector (v_out2) 5411 step 3: adjust the scalar result (s_out3) if needed. 5412 5413 Step 1 can be accomplished using one the following three schemes: 5414 (scheme 1) using reduc_fn, if available. 5415 (scheme 2) using whole-vector shifts, if available. 5416 (scheme 3) using a scalar loop. In this case steps 1+2 above are 5417 combined. 5418 5419 The overall epilog code looks like this: 5420 5421 s_out0 = phi <s_loop> # original EXIT_PHI 5422 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5423 v_out2 = reduce <v_out1> # step 1 5424 s_out3 = extract_field <v_out2, 0> # step 2 5425 s_out4 = adjust_result <s_out3> # step 3 5426 5427 (step 3 is optional, and steps 1 and 2 may be combined). 5428 Lastly, the uses of s_out0 are replaced by s_out4. */ 5429 5430 5431 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: 5432 v_out1 = phi <VECT_DEF> 5433 Store them in NEW_PHIS. 
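
   For instance, with ncopies == 2 this creates two exit PHIs
   (hypothetical names)

     v_out1_0 = phi <VECT_DEF_0>
     v_out1_1 = phi <VECT_DEF_1>

   converts their results to VECTYPE if necessary, and collects them
   in REDUC_INPUTS.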
*/ 5434 if (double_reduc) 5435 loop = outer_loop; 5436 exit_bb = single_exit (loop)->dest; 5437 exit_gsi = gsi_after_labels (exit_bb); 5438 reduc_inputs.create (slp_node ? vec_num : ncopies); 5439 for (unsigned i = 0; i < vec_num; i++) 5440 { 5441 gimple_seq stmts = NULL; 5442 if (slp_node) 5443 def = vect_get_slp_vect_def (slp_node, i); 5444 else 5445 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]); 5446 for (j = 0; j < ncopies; j++) 5447 { 5448 tree new_def = copy_ssa_name (def); 5449 phi = create_phi_node (new_def, exit_bb); 5450 if (j) 5451 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); 5452 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 5453 new_def = gimple_convert (&stmts, vectype, new_def); 5454 reduc_inputs.quick_push (new_def); 5455 } 5456 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5457 } 5458 5459 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 5460 (i.e. when reduc_fn is not available) and in the final adjustment 5461 code (if needed). Also get the original scalar reduction variable as 5462 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it 5463 represents a reduction pattern), the tree-code and scalar-def are 5464 taken from the original stmt that the pattern-stmt (STMT) replaces. 5465 Otherwise (it is a regular reduction) - the tree-code and scalar-def 5466 are taken from STMT. */ 5467 5468 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); 5469 if (orig_stmt_info != stmt_info) 5470 { 5471 /* Reduction pattern */ 5472 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 5473 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); 5474 } 5475 5476 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt); 5477 scalar_type = TREE_TYPE (scalar_dest); 5478 scalar_results.truncate (0); 5479 scalar_results.reserve_exact (group_size); 5480 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 5481 bitsize = TYPE_SIZE (scalar_type); 5482 5483 /* True if we should implement SLP_REDUC using native reduction operations 5484 instead of scalar operations. */ 5485 direct_slp_reduc = (reduc_fn != IFN_LAST 5486 && slp_reduc 5487 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); 5488 5489 /* In case of reduction chain, e.g., 5490 # a1 = phi <a3, a0> 5491 a2 = operation (a1) 5492 a3 = operation (a2), 5493 5494 we may end up with more than one vector result. Here we reduce them 5495 to one vector. 5496 5497 The same is true if we couldn't use a single defuse cycle. */ 5498 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) 5499 || direct_slp_reduc 5500 || ncopies > 1) 5501 { 5502 gimple_seq stmts = NULL; 5503 tree single_input = reduc_inputs[0]; 5504 for (k = 1; k < reduc_inputs.length (); k++) 5505 single_input = gimple_build (&stmts, code, vectype, 5506 single_input, reduc_inputs[k]); 5507 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5508 5509 reduc_inputs.truncate (0); 5510 reduc_inputs.safe_push (single_input); 5511 } 5512 5513 tree orig_reduc_input = reduc_inputs[0]; 5514 5515 /* If this loop is an epilogue loop that can be skipped after the 5516 main loop, we can only share a reduction operation between the 5517 main loop and the epilogue if we put it at the target of the 5518 skip edge. 5519 5520 We can still reuse accumulators if this check fails. Doing so has 5521 the minor(?) benefit of making the epilogue loop's scalar result 5522 independent of the main loop's scalar result. 
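
   For illustration, when the accumulator is reused the code below
   emits at the target of the skip edge roughly

     merged = PHI <epilogue reduction input (loop exit),
                   reused main loop accumulator (skip edge)>

   (MERGED is a hypothetical name) and generates the rest of the
   epilogue in that block, so both entry paths feed the same
   reduction code.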
*/ 5523 bool unify_with_main_loop_p = false; 5524 if (reduc_info->reused_accumulator 5525 && loop_vinfo->skip_this_loop_edge 5526 && single_succ_p (exit_bb) 5527 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest) 5528 { 5529 unify_with_main_loop_p = true; 5530 5531 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; 5532 reduc_inputs[0] = make_ssa_name (vectype); 5533 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); 5534 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb), 5535 UNKNOWN_LOCATION); 5536 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, 5537 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); 5538 exit_gsi = gsi_after_labels (reduc_block); 5539 } 5540 5541 /* Shouldn't be used beyond this point. */ 5542 exit_bb = nullptr; 5543 5544 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION 5545 && reduc_fn != IFN_LAST) 5546 { 5547 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing 5548 various data values where the condition matched and another vector 5549 (INDUCTION_INDEX) containing all the indexes of those matches. We 5550 need to extract the last matching index (which will be the index with 5551 highest value) and use this to index into the data vector. 5552 For the case where there were no matches, the data vector will contain 5553 all default values and the index vector will be all zeros. */ 5554 5555 /* Get various versions of the type of the vector of indexes. */ 5556 tree index_vec_type = TREE_TYPE (induction_index); 5557 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); 5558 tree index_scalar_type = TREE_TYPE (index_vec_type); 5559 tree index_vec_cmp_type = truth_type_for (index_vec_type); 5560 5561 /* Get an unsigned integer version of the type of the data vector. */ 5562 int scalar_precision 5563 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 5564 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); 5565 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned, 5566 vectype); 5567 5568 /* First we need to create a vector (ZERO_VEC) of zeros and another 5569 vector (MAX_INDEX_VEC) filled with the last matching index, which we 5570 can create using a MAX reduction and then expanding. 5571 In the case where the loop never made any matches, the max index will 5572 be zero. */ 5573 5574 /* Vector of {0, 0, 0,...}. */ 5575 tree zero_vec = build_zero_cst (vectype); 5576 5577 /* Find maximum value from the vector of found indexes. */ 5578 tree max_index = make_ssa_name (index_scalar_type); 5579 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 5580 1, induction_index); 5581 gimple_call_set_lhs (max_index_stmt, max_index); 5582 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); 5583 5584 /* Vector of {max_index, max_index, max_index,...}. */ 5585 tree max_index_vec = make_ssa_name (index_vec_type); 5586 tree max_index_vec_rhs = build_vector_from_val (index_vec_type, 5587 max_index); 5588 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, 5589 max_index_vec_rhs); 5590 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); 5591 5592 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes 5593 with the vector (INDUCTION_INDEX) of found indexes, choosing values 5594 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC) 5595 otherwise. Only one value should match, resulting in a vector 5596 (VEC_COND) with one data value and the rest zeros. 
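
   For example, if INDUCTION_INDEX is {0, 3, 0, 2} and the data vector
   is {d0, d1, d2, d3}, then MAX_INDEX is 3, MAX_INDEX_VEC is
   {3, 3, 3, 3} and VEC_COND keeps only lane 1: {0, d1, 0, 0}.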
5597 In the case where the loop never made any matches, every index will 5598 match, resulting in a vector with all data values (which will all be 5599 the default value). */ 5600 5601 /* Compare the max index vector to the vector of found indexes to find 5602 the position of the max value. */ 5603 tree vec_compare = make_ssa_name (index_vec_cmp_type); 5604 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, 5605 induction_index, 5606 max_index_vec); 5607 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); 5608 5609 /* Use the compare to choose either values from the data vector or 5610 zero. */ 5611 tree vec_cond = make_ssa_name (vectype); 5612 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, 5613 vec_compare, 5614 reduc_inputs[0], 5615 zero_vec); 5616 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); 5617 5618 /* Finally we need to extract the data value from the vector (VEC_COND) 5619 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR 5620 reduction, but because this doesn't exist, we can use a MAX reduction 5621 instead. The data value might be signed or a float so we need to cast 5622 it first. 5623 In the case where the loop never made any matches, the data values are 5624 all identical, and so will reduce down correctly. */ 5625 5626 /* Make the matched data values unsigned. */ 5627 tree vec_cond_cast = make_ssa_name (vectype_unsigned); 5628 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned, 5629 vec_cond); 5630 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast, 5631 VIEW_CONVERT_EXPR, 5632 vec_cond_cast_rhs); 5633 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT); 5634 5635 /* Reduce down to a scalar value. */ 5636 tree data_reduc = make_ssa_name (scalar_type_unsigned); 5637 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 5638 1, vec_cond_cast); 5639 gimple_call_set_lhs (data_reduc_stmt, data_reduc); 5640 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); 5641 5642 /* Convert the reduced value back to the result type and set as the 5643 result. */ 5644 gimple_seq stmts = NULL; 5645 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, 5646 data_reduc); 5647 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5648 scalar_results.safe_push (new_temp); 5649 } 5650 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION 5651 && reduc_fn == IFN_LAST) 5652 { 5653 /* Condition reduction without supported IFN_REDUC_MAX. Generate 5654 idx = 0; 5655 idx_val = induction_index[0]; 5656 val = data_reduc[0]; 5657 for (idx = 0, val = init, i = 0; i < nelts; ++i) 5658 if (induction_index[i] > idx_val) 5659 val = data_reduc[i], idx_val = induction_index[i]; 5660 return val; */ 5661 5662 tree data_eltype = TREE_TYPE (vectype); 5663 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); 5664 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); 5665 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); 5666 /* Enforced by vectorizable_reduction, which ensures we have target 5667 support before allowing a conditional reduction on variable-length 5668 vectors. 
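   The constant element count lets the loop below be fully unrolled;
   for a hypothetical 4-element vector of 32-bit elements it emits,
   per element,

     idx_k = BIT_FIELD_REF <induction_index, 32, k*32>;
     val_k = BIT_FIELD_REF <data, 32, k*32>;

   and keeps the (idx, val) pair with the greatest index by combining
   the pairs with MAX_EXPR and COND_EXPR.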
*/ 5669 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); 5670 tree idx_val = NULL_TREE, val = NULL_TREE; 5671 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) 5672 { 5673 tree old_idx_val = idx_val; 5674 tree old_val = val; 5675 idx_val = make_ssa_name (idx_eltype); 5676 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF, 5677 build3 (BIT_FIELD_REF, idx_eltype, 5678 induction_index, 5679 bitsize_int (el_size), 5680 bitsize_int (off))); 5681 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5682 val = make_ssa_name (data_eltype); 5683 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, 5684 build3 (BIT_FIELD_REF, 5685 data_eltype, 5686 reduc_inputs[0], 5687 bitsize_int (el_size), 5688 bitsize_int (off))); 5689 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5690 if (off != 0) 5691 { 5692 tree new_idx_val = idx_val; 5693 if (off != v_size - el_size) 5694 { 5695 new_idx_val = make_ssa_name (idx_eltype); 5696 epilog_stmt = gimple_build_assign (new_idx_val, 5697 MAX_EXPR, idx_val, 5698 old_idx_val); 5699 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5700 } 5701 tree new_val = make_ssa_name (data_eltype); 5702 epilog_stmt = gimple_build_assign (new_val, 5703 COND_EXPR, 5704 build2 (GT_EXPR, 5705 boolean_type_node, 5706 idx_val, 5707 old_idx_val), 5708 val, old_val); 5709 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5710 idx_val = new_idx_val; 5711 val = new_val; 5712 } 5713 } 5714 /* Convert the reduced value back to the result type and set as the 5715 result. */ 5716 gimple_seq stmts = NULL; 5717 val = gimple_convert (&stmts, scalar_type, val); 5718 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5719 scalar_results.safe_push (val); 5720 } 5721 5722 /* 2.3 Create the reduction code, using one of the three schemes described 5723 above. In SLP we simply need to extract all the elements from the 5724 vector (without reducing them), so we use scalar shifts. */ 5725 else if (reduc_fn != IFN_LAST && !slp_reduc) 5726 { 5727 tree tmp; 5728 tree vec_elem_type; 5729 5730 /* Case 1: Create: 5731 v_out2 = reduc_expr <v_out1> */ 5732 5733 if (dump_enabled_p ()) 5734 dump_printf_loc (MSG_NOTE, vect_location, 5735 "Reduce using direct vector reduction.\n"); 5736 5737 gimple_seq stmts = NULL; 5738 vec_elem_type = TREE_TYPE (vectype); 5739 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), 5740 vec_elem_type, reduc_inputs[0]); 5741 new_temp = gimple_convert (&stmts, scalar_type, new_temp); 5742 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5743 5744 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) 5745 && induc_val) 5746 { 5747 /* Earlier we set the initial value to be a vector if induc_val 5748 values. Check the result and if it is induc_val then replace 5749 with the original initial value, unless induc_val is 5750 the same as initial_def already. 
*/
5751 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5752 induc_val);
5753 tree initial_def = reduc_info->reduc_initial_values[0];
5754
5755 tmp = make_ssa_name (new_scalar_dest);
5756 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5757 initial_def, new_temp);
5758 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5759 new_temp = tmp;
5760 }
5761
5762 scalar_results.safe_push (new_temp);
5763 }
5764 else if (direct_slp_reduc)
5765 {
5766 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5767 with the elements for other SLP statements replaced with the
5768 neutral value. We can then do a normal reduction on each vector. */
5769
5770 /* Enforced by vectorizable_reduction. */
5771 gcc_assert (reduc_inputs.length () == 1);
5772 gcc_assert (pow2p_hwi (group_size));
5773
5774 gimple_seq seq = NULL;
5775
5776 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5777 and the same element size as VECTYPE. */
5778 tree index = build_index_vector (vectype, 0, 1);
5779 tree index_type = TREE_TYPE (index);
5780 tree index_elt_type = TREE_TYPE (index_type);
5781 tree mask_type = truth_type_for (index_type);
5782
5783 /* Create a vector that, for each element, identifies which of
5784 the REDUC_GROUP_SIZE results should use it. */
5785 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5786 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5787 build_vector_from_val (index_type, index_mask));
5788
5789 /* Get a neutral vector value. This is simply a splat of the neutral
5790 scalar value if we have one, otherwise the initial scalar value
5791 is itself a neutral value. */
5792 tree vector_identity = NULL_TREE;
5793 tree neutral_op = NULL_TREE;
5794 if (slp_node)
5795 {
5796 tree initial_value = NULL_TREE;
5797 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5798 initial_value = reduc_info->reduc_initial_values[0];
5799 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5800 initial_value);
5801 }
5802 if (neutral_op)
5803 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5804 neutral_op);
5805 for (unsigned int i = 0; i < group_size; ++i)
5806 {
5807 /* If there's no universal neutral value, we can use the
5808 initial scalar value from the original PHI. This is used
5809 for MIN and MAX reduction, for example. */
5810 if (!neutral_op)
5811 {
5812 tree scalar_value = reduc_info->reduc_initial_values[i];
5813 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5814 scalar_value);
5815 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5816 scalar_value);
5817 }
5818
5819 /* Calculate the equivalent of:
5820
5821 sel[j] = (index[j] == i);
5822
5823 which selects the elements of REDUC_INPUTS[0] that should
5824 be included in the result. */
5825 tree compare_val = build_int_cst (index_elt_type, i);
5826 compare_val = build_vector_from_val (index_type, compare_val);
5827 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5828 index, compare_val);
5829
5830 /* Calculate the equivalent of:
5831
5832 vec = sel ? reduc_inputs[0] : vector_identity;
5833
5834 VEC is now suitable for a full vector reduction. */
5835 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5836 sel, reduc_inputs[0], vector_identity);
5837
5838 /* Do the reduction and convert it to the appropriate type.
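   For instance, with GROUP_SIZE == 2 and a hypothetical V4SI input,
   INDEX & 1 is {0, 1, 0, 1}; for i == 0 SEL is {1, 0, 1, 0}, so VEC
   keeps lanes 0 and 2 of REDUC_INPUTS[0], carries the identity in the
   other lanes, and the reduction of VEC yields the scalar result of
   the first SLP statement.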
*/ 5839 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), 5840 TREE_TYPE (vectype), vec); 5841 scalar = gimple_convert (&seq, scalar_type, scalar); 5842 scalar_results.safe_push (scalar); 5843 } 5844 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); 5845 } 5846 else 5847 { 5848 bool reduce_with_shift; 5849 tree vec_temp; 5850 5851 gcc_assert (slp_reduc || reduc_inputs.length () == 1); 5852 5853 /* See if the target wants to do the final (shift) reduction 5854 in a vector mode of smaller size and first reduce upper/lower 5855 halves against each other. */ 5856 enum machine_mode mode1 = mode; 5857 tree stype = TREE_TYPE (vectype); 5858 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5859 unsigned nunits1 = nunits; 5860 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode 5861 && reduc_inputs.length () == 1) 5862 { 5863 nunits1 = GET_MODE_NUNITS (mode1).to_constant (); 5864 /* For SLP reductions we have to make sure lanes match up, but 5865 since we're doing individual element final reduction reducing 5866 vector width here is even more important. 5867 ??? We can also separate lanes with permutes, for the common 5868 case of power-of-two group-size odd/even extracts would work. */ 5869 if (slp_reduc && nunits != nunits1) 5870 { 5871 nunits1 = least_common_multiple (nunits1, group_size); 5872 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits); 5873 } 5874 } 5875 if (!slp_reduc 5876 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) 5877 nunits1 = GET_MODE_NUNITS (mode1).to_constant (); 5878 5879 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), 5880 stype, nunits1); 5881 reduce_with_shift = have_whole_vector_shift (mode1); 5882 if (!VECTOR_MODE_P (mode1) 5883 || !directly_supported_p (code, vectype1)) 5884 reduce_with_shift = false; 5885 5886 /* First reduce the vector to the desired vector size we should 5887 do shift reduction on by combining upper and lower halves. */ 5888 gimple_seq stmts = NULL; 5889 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1, 5890 code, &stmts); 5891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5892 reduc_inputs[0] = new_temp; 5893 5894 if (reduce_with_shift && !slp_reduc) 5895 { 5896 int element_bitsize = tree_to_uhwi (bitsize); 5897 /* Enforced by vectorizable_reduction, which disallows SLP reductions 5898 for variable-length vectors and also requires direct target support 5899 for loop reductions. 
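   With a known element count the shift scheme below is simple; e.g.
   for a hypothetical V4SI PLUS reduction of {a0, a1, a2, a3}, shifting
   by two elements and adding gives {a0+a2, a1+a3, ...}, shifting that
   by one element and adding leaves a0+a1+a2+a3 in element 0, which is
   then extracted with a BIT_FIELD_REF.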
*/ 5900 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5901 int nelements = vec_size_in_bits / element_bitsize; 5902 vec_perm_builder sel; 5903 vec_perm_indices indices; 5904 5905 int elt_offset; 5906 5907 tree zero_vec = build_zero_cst (vectype1); 5908 /* Case 2: Create: 5909 for (offset = nelements/2; offset >= 1; offset/=2) 5910 { 5911 Create: va' = vec_shift <va, offset> 5912 Create: va = vop <va, va'> 5913 } */ 5914 5915 tree rhs; 5916 5917 if (dump_enabled_p ()) 5918 dump_printf_loc (MSG_NOTE, vect_location, 5919 "Reduce using vector shifts\n"); 5920 5921 gimple_seq stmts = NULL; 5922 new_temp = gimple_convert (&stmts, vectype1, new_temp); 5923 for (elt_offset = nelements / 2; 5924 elt_offset >= 1; 5925 elt_offset /= 2) 5926 { 5927 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); 5928 indices.new_vector (sel, 2, nelements); 5929 tree mask = vect_gen_perm_mask_any (vectype1, indices); 5930 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1, 5931 new_temp, zero_vec, mask); 5932 new_temp = gimple_build (&stmts, code, 5933 vectype1, new_name, new_temp); 5934 } 5935 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5936 5937 /* 2.4 Extract the final scalar result. Create: 5938 s_out3 = extract_field <v_out2, bitpos> */ 5939 5940 if (dump_enabled_p ()) 5941 dump_printf_loc (MSG_NOTE, vect_location, 5942 "extract scalar result\n"); 5943 5944 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, 5945 bitsize, bitsize_zero_node); 5946 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5947 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5948 gimple_assign_set_lhs (epilog_stmt, new_temp); 5949 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5950 scalar_results.safe_push (new_temp); 5951 } 5952 else 5953 { 5954 /* Case 3: Create: 5955 s = extract_field <v_out2, 0> 5956 for (offset = element_size; 5957 offset < vector_size; 5958 offset += element_size;) 5959 { 5960 Create: s' = extract_field <v_out2, offset> 5961 Create: s = op <s, s'> // For non SLP cases 5962 } */ 5963 5964 if (dump_enabled_p ()) 5965 dump_printf_loc (MSG_NOTE, vect_location, 5966 "Reduce using scalar code.\n"); 5967 5968 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5969 int element_bitsize = tree_to_uhwi (bitsize); 5970 tree compute_type = TREE_TYPE (vectype); 5971 gimple_seq stmts = NULL; 5972 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp) 5973 { 5974 int bit_offset; 5975 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, 5976 vec_temp, bitsize, bitsize_zero_node); 5977 5978 /* In SLP we don't need to apply reduction operation, so we just 5979 collect s' values in SCALAR_RESULTS. */ 5980 if (slp_reduc) 5981 scalar_results.safe_push (new_temp); 5982 5983 for (bit_offset = element_bitsize; 5984 bit_offset < vec_size_in_bits; 5985 bit_offset += element_bitsize) 5986 { 5987 tree bitpos = bitsize_int (bit_offset); 5988 new_name = gimple_build (&stmts, BIT_FIELD_REF, 5989 compute_type, vec_temp, 5990 bitsize, bitpos); 5991 if (slp_reduc) 5992 { 5993 /* In SLP we don't need to apply reduction operation, so 5994 we just collect s' values in SCALAR_RESULTS. */ 5995 new_temp = new_name; 5996 scalar_results.safe_push (new_name); 5997 } 5998 else 5999 new_temp = gimple_build (&stmts, code, compute_type, 6000 new_name, new_temp); 6001 } 6002 } 6003 6004 /* The only case where we need to reduce scalar results in SLP, is 6005 unrolling. 
If the size of SCALAR_RESULTS is greater than 6006 REDUC_GROUP_SIZE, we reduce them combining elements modulo 6007 REDUC_GROUP_SIZE. */ 6008 if (slp_reduc) 6009 { 6010 tree res, first_res, new_res; 6011 6012 /* Reduce multiple scalar results in case of SLP unrolling. */ 6013 for (j = group_size; scalar_results.iterate (j, &res); 6014 j++) 6015 { 6016 first_res = scalar_results[j % group_size]; 6017 new_res = gimple_build (&stmts, code, compute_type, 6018 first_res, res); 6019 scalar_results[j % group_size] = new_res; 6020 } 6021 scalar_results.truncate (group_size); 6022 for (k = 0; k < group_size; k++) 6023 scalar_results[k] = gimple_convert (&stmts, scalar_type, 6024 scalar_results[k]); 6025 } 6026 else 6027 { 6028 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ 6029 new_temp = gimple_convert (&stmts, scalar_type, new_temp); 6030 scalar_results.safe_push (new_temp); 6031 } 6032 6033 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 6034 } 6035 6036 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) 6037 && induc_val) 6038 { 6039 /* Earlier we set the initial value to be a vector if induc_val 6040 values. Check the result and if it is induc_val then replace 6041 with the original initial value, unless induc_val is 6042 the same as initial_def already. */ 6043 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 6044 induc_val); 6045 tree initial_def = reduc_info->reduc_initial_values[0]; 6046 6047 tree tmp = make_ssa_name (new_scalar_dest); 6048 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 6049 initial_def, new_temp); 6050 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 6051 scalar_results[0] = tmp; 6052 } 6053 } 6054 6055 /* 2.5 Adjust the final result by the initial value of the reduction 6056 variable. (When such adjustment is not needed, then 6057 'adjustment_def' is zero). For example, if code is PLUS we create: 6058 new_temp = loop_exit_def + adjustment_def */ 6059 6060 if (adjustment_def) 6061 { 6062 gcc_assert (!slp_reduc); 6063 gimple_seq stmts = NULL; 6064 if (double_reduc) 6065 { 6066 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); 6067 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); 6068 new_temp = gimple_build (&stmts, code, vectype, 6069 reduc_inputs[0], adjustment_def); 6070 } 6071 else 6072 { 6073 new_temp = scalar_results[0]; 6074 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 6075 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype), 6076 adjustment_def); 6077 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp); 6078 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype), 6079 new_temp, adjustment_def); 6080 new_temp = gimple_convert (&stmts, scalar_type, new_temp); 6081 } 6082 6083 epilog_stmt = gimple_seq_last_stmt (stmts); 6084 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 6085 scalar_results[0] = new_temp; 6086 } 6087 6088 /* Record this operation if it could be reused by the epilogue loop. */ 6089 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION 6090 && vec_num == 1) 6091 loop_vinfo->reusable_accumulators.put (scalar_results[0], 6092 { orig_reduc_input, reduc_info }); 6093 6094 if (double_reduc) 6095 loop = outer_loop; 6096 6097 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 6098 phis with new adjusted scalar results, i.e., replace use <s_out0> 6099 with use <s_out4>. 
6100 6101 Transform: 6102 loop_exit: 6103 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 6104 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 6105 v_out2 = reduce <v_out1> 6106 s_out3 = extract_field <v_out2, 0> 6107 s_out4 = adjust_result <s_out3> 6108 use <s_out0> 6109 use <s_out0> 6110 6111 into: 6112 6113 loop_exit: 6114 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 6115 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 6116 v_out2 = reduce <v_out1> 6117 s_out3 = extract_field <v_out2, 0> 6118 s_out4 = adjust_result <s_out3> 6119 use <s_out4> 6120 use <s_out4> */ 6121 6122 gcc_assert (live_out_stmts.size () == scalar_results.length ()); 6123 for (k = 0; k < live_out_stmts.size (); k++) 6124 { 6125 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]); 6126 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt); 6127 6128 phis.create (3); 6129 /* Find the loop-closed-use at the loop exit of the original scalar 6130 result. (The reduction result is expected to have two immediate uses, 6131 one at the latch block, and one at the loop exit). For double 6132 reductions we are looking for exit phis of the outer loop. */ 6133 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 6134 { 6135 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 6136 { 6137 if (!is_gimple_debug (USE_STMT (use_p))) 6138 phis.safe_push (USE_STMT (use_p)); 6139 } 6140 else 6141 { 6142 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) 6143 { 6144 tree phi_res = PHI_RESULT (USE_STMT (use_p)); 6145 6146 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) 6147 { 6148 if (!flow_bb_inside_loop_p (loop, 6149 gimple_bb (USE_STMT (phi_use_p))) 6150 && !is_gimple_debug (USE_STMT (phi_use_p))) 6151 phis.safe_push (USE_STMT (phi_use_p)); 6152 } 6153 } 6154 } 6155 } 6156 6157 FOR_EACH_VEC_ELT (phis, i, exit_phi) 6158 { 6159 /* Replace the uses: */ 6160 orig_name = PHI_RESULT (exit_phi); 6161 6162 /* Look for a single use at the target of the skip edge. */ 6163 if (unify_with_main_loop_p) 6164 { 6165 use_operand_p use_p; 6166 gimple *user; 6167 if (!single_imm_use (orig_name, &use_p, &user)) 6168 gcc_unreachable (); 6169 orig_name = gimple_get_lhs (user); 6170 } 6171 6172 scalar_result = scalar_results[k]; 6173 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 6174 { 6175 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 6176 SET_USE (use_p, scalar_result); 6177 update_stmt (use_stmt); 6178 } 6179 } 6180 6181 phis.release (); 6182 } 6183 } 6184 6185 /* Return a vector of type VECTYPE that is equal to the vector select 6186 operation "MASK ? VEC : IDENTITY". Insert the select statements 6187 before GSI. */ 6188 6189 static tree 6190 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, 6191 tree vec, tree identity) 6192 { 6193 tree cond = make_temp_ssa_name (vectype, NULL, "cond"); 6194 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, 6195 mask, vec, identity); 6196 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 6197 return cond; 6198 } 6199 6200 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right 6201 order, starting with LHS. Insert the extraction statements before GSI and 6202 associate the new scalar SSA names with variable SCALAR_DEST. 6203 Return the SSA name for the result. 
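
   For example, for CODE == PLUS_EXPR and a hypothetical 4-element
   VECTOR_RHS of 32-bit elements this emits

     s_0 = BIT_FIELD_REF <vector_rhs, 32, 0>;
     tmp_0 = lhs + s_0;
     s_1 = BIT_FIELD_REF <vector_rhs, 32, 32>;
     tmp_1 = tmp_0 + s_1;
     ...

   and returns tmp_3, preserving the left-to-right association that
   in-order reductions require.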
*/ 6204 6205 static tree 6206 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, 6207 tree_code code, tree lhs, tree vector_rhs) 6208 { 6209 tree vectype = TREE_TYPE (vector_rhs); 6210 tree scalar_type = TREE_TYPE (vectype); 6211 tree bitsize = TYPE_SIZE (scalar_type); 6212 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 6213 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); 6214 6215 for (unsigned HOST_WIDE_INT bit_offset = 0; 6216 bit_offset < vec_size_in_bits; 6217 bit_offset += element_bitsize) 6218 { 6219 tree bitpos = bitsize_int (bit_offset); 6220 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs, 6221 bitsize, bitpos); 6222 6223 gassign *stmt = gimple_build_assign (scalar_dest, rhs); 6224 rhs = make_ssa_name (scalar_dest, stmt); 6225 gimple_assign_set_lhs (stmt, rhs); 6226 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 6227 6228 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs); 6229 tree new_name = make_ssa_name (scalar_dest, stmt); 6230 gimple_assign_set_lhs (stmt, new_name); 6231 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 6232 lhs = new_name; 6233 } 6234 return lhs; 6235 } 6236 6237 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the 6238 type of the vector input. */ 6239 6240 static internal_fn 6241 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) 6242 { 6243 internal_fn mask_reduc_fn; 6244 6245 switch (reduc_fn) 6246 { 6247 case IFN_FOLD_LEFT_PLUS: 6248 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; 6249 break; 6250 6251 default: 6252 return IFN_LAST; 6253 } 6254 6255 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, 6256 OPTIMIZE_FOR_SPEED)) 6257 return mask_reduc_fn; 6258 return IFN_LAST; 6259 } 6260 6261 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the 6262 statement that sets the live-out value. REDUC_DEF_STMT is the phi 6263 statement. CODE is the operation performed by STMT_INFO and OPS are 6264 its scalar operands. REDUC_INDEX is the index of the operand in 6265 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that 6266 implements in-order reduction, or IFN_LAST if we should open-code it. 6267 VECTYPE_IN is the type of the vector input. MASKS specifies the masks 6268 that should be used to control the operation in a fully-masked loop. 
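
   For example (internal function availability is target-dependent),
   a strictly-ordered scalar sum

     for (i = 0; i < n; i++)
       res += a[i];

   becomes, when REDUC_FN == IFN_FOLD_LEFT_PLUS, one call per vector
   def

     res_1 = .FOLD_LEFT_PLUS (res_0, vec_a);

   or, in a fully-masked loop with IFN_MASK_FOLD_LEFT_PLUS available,

     res_1 = .MASK_FOLD_LEFT_PLUS (res_0, vec_a, loop_mask);

   otherwise the reduction is open-coded via vect_expand_fold_left.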
*/ 6269 6270 static bool 6271 vectorize_fold_left_reduction (loop_vec_info loop_vinfo, 6272 stmt_vec_info stmt_info, 6273 gimple_stmt_iterator *gsi, 6274 gimple **vec_stmt, slp_tree slp_node, 6275 gimple *reduc_def_stmt, 6276 tree_code code, internal_fn reduc_fn, 6277 tree ops[3], tree vectype_in, 6278 int reduc_index, vec_loop_masks *masks) 6279 { 6280 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6281 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6282 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in); 6283 6284 int ncopies; 6285 if (slp_node) 6286 ncopies = 1; 6287 else 6288 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6289 6290 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); 6291 gcc_assert (ncopies == 1); 6292 gcc_assert (TREE_CODE_LENGTH (code) == binary_op); 6293 6294 if (slp_node) 6295 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), 6296 TYPE_VECTOR_SUBPARTS (vectype_in))); 6297 6298 tree op0 = ops[1 - reduc_index]; 6299 6300 int group_size = 1; 6301 stmt_vec_info scalar_dest_def_info; 6302 auto_vec<tree> vec_oprnds0; 6303 if (slp_node) 6304 { 6305 auto_vec<vec<tree> > vec_defs (2); 6306 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs); 6307 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]); 6308 vec_defs[0].release (); 6309 vec_defs[1].release (); 6310 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 6311 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 6312 } 6313 else 6314 { 6315 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1, 6316 op0, &vec_oprnds0); 6317 scalar_dest_def_info = stmt_info; 6318 } 6319 6320 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt); 6321 tree scalar_type = TREE_TYPE (scalar_dest); 6322 tree reduc_var = gimple_phi_result (reduc_def_stmt); 6323 6324 int vec_num = vec_oprnds0.length (); 6325 gcc_assert (vec_num == 1 || slp_node); 6326 tree vec_elem_type = TREE_TYPE (vectype_out); 6327 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type)); 6328 6329 tree vector_identity = NULL_TREE; 6330 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 6331 vector_identity = build_zero_cst (vectype_out); 6332 6333 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL); 6334 int i; 6335 tree def0; 6336 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 6337 { 6338 gimple *new_stmt; 6339 tree mask = NULL_TREE; 6340 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 6341 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i); 6342 6343 /* Handle MINUS by adding the negative. */ 6344 if (reduc_fn != IFN_LAST && code == MINUS_EXPR) 6345 { 6346 tree negated = make_ssa_name (vectype_out); 6347 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); 6348 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 6349 def0 = negated; 6350 } 6351 6352 if (mask && mask_reduc_fn == IFN_LAST) 6353 def0 = merge_with_identity (gsi, mask, vectype_out, def0, 6354 vector_identity); 6355 6356 /* On the first iteration the input is simply the scalar phi 6357 result, and for subsequent iterations it is the output of 6358 the preceding operation. 
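   E.g. for a two-statement SLP chain (hypothetical names):

     tmp_1 = .FOLD_LEFT_PLUS (phi_result, def_0);
     scalar_dest = .FOLD_LEFT_PLUS (tmp_1, def_1);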
*/
6359 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6360 {
6361 if (mask && mask_reduc_fn != IFN_LAST)
6362 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6363 def0, mask);
6364 else
6365 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6366 def0);
6367 /* For chained SLP reductions the output of the previous reduction
6368 operation serves as the input of the next. For the final statement
6369 the output cannot be a temporary - we reuse the original
6370 scalar destination of the last statement. */
6371 if (i != vec_num - 1)
6372 {
6373 gimple_set_lhs (new_stmt, scalar_dest_var);
6374 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6375 gimple_set_lhs (new_stmt, reduc_var);
6376 }
6377 }
6378 else
6379 {
6380 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6381 reduc_var, def0);
6382 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6383 /* Remove the statement, so that we can use the same code paths
6384 as for statements that we've just created. */
6385 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6386 gsi_remove (&tmp_gsi, true);
6387 }
6388
6389 if (i == vec_num - 1)
6390 {
6391 gimple_set_lhs (new_stmt, scalar_dest);
6392 vect_finish_replace_stmt (loop_vinfo,
6393 scalar_dest_def_info,
6394 new_stmt);
6395 }
6396 else
6397 vect_finish_stmt_generation (loop_vinfo,
6398 scalar_dest_def_info,
6399 new_stmt, gsi);
6400
6401 if (slp_node)
6402 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6403 else
6404 {
6405 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6406 *vec_stmt = new_stmt;
6407 }
6408 }
6409
6410 return true;
6411 }
6412
6413 /* Function is_nonwrapping_integer_induction.
6414
6415 Check whether STMT_VINFO (which is part of loop LOOP) is an integer
6416 induction that increments without causing overflow. */
6417
6418 static bool
6419 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6420 {
6421 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6422 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6423 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6424 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6425 widest_int ni, max_loop_value, lhs_max;
6426 wi::overflow_type overflow = wi::OVF_NONE;
6427
6428 /* Make sure the loop is integer based. */
6429 if (TREE_CODE (base) != INTEGER_CST
6430 || TREE_CODE (step) != INTEGER_CST)
6431 return false;
6432
6433 /* Check that the max size of the loop will not wrap. */
6434
6435 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6436 return true;
6437
6438 if (! max_stmt_executions (loop, &ni))
6439 return false;
6440
6441 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6442 &overflow);
6443 if (overflow)
6444 return false;
6445
6446 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6447 TYPE_SIGN (lhs_type), &overflow);
6448 if (overflow)
6449 return false;
6450
6451 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6452 <= TYPE_PRECISION (lhs_type));
6453 }
6454
6455 /* Check if masking can be supported by inserting a conditional expression.
6456 CODE is the code for the operation. COND_FN is the conditional internal
6457 function, if it exists. VECTYPE_IN is the type of the vector input.
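
   For example, a DOT_PROD_EXPR with no conditional internal function
   can still be masked by zeroing the inactive lanes of one operand:

     masked_op1 = mask ? op1 : {0, ...};
     res = DOT_PROD_EXPR <op0, masked_op1, acc>;

   which is what build_vect_cond_expr emits below; the zeroed lanes
   contribute nothing to the dot product.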
*/ 6458 static bool 6459 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn, 6460 tree vectype_in) 6461 { 6462 if (cond_fn != IFN_LAST 6463 && direct_internal_fn_supported_p (cond_fn, vectype_in, 6464 OPTIMIZE_FOR_SPEED)) 6465 return false; 6466 6467 if (code.is_tree_code ()) 6468 switch (tree_code (code)) 6469 { 6470 case DOT_PROD_EXPR: 6471 case SAD_EXPR: 6472 return true; 6473 6474 default: 6475 break; 6476 } 6477 return false; 6478 } 6479 6480 /* Insert a conditional expression to enable masked vectorization. CODE is the 6481 code for the operation. VOP is the array of operands. MASK is the loop 6482 mask. GSI is a statement iterator used to place the new conditional 6483 expression. */ 6484 static void 6485 build_vect_cond_expr (code_helper code, tree vop[3], tree mask, 6486 gimple_stmt_iterator *gsi) 6487 { 6488 switch (tree_code (code)) 6489 { 6490 case DOT_PROD_EXPR: 6491 { 6492 tree vectype = TREE_TYPE (vop[1]); 6493 tree zero = build_zero_cst (vectype); 6494 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); 6495 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, 6496 mask, vop[1], zero); 6497 gsi_insert_before (gsi, select, GSI_SAME_STMT); 6498 vop[1] = masked_op1; 6499 break; 6500 } 6501 6502 case SAD_EXPR: 6503 { 6504 tree vectype = TREE_TYPE (vop[1]); 6505 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); 6506 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, 6507 mask, vop[1], vop[0]); 6508 gsi_insert_before (gsi, select, GSI_SAME_STMT); 6509 vop[1] = masked_op1; 6510 break; 6511 } 6512 6513 default: 6514 gcc_unreachable (); 6515 } 6516 } 6517 6518 /* Function vectorizable_reduction. 6519 6520 Check if STMT_INFO performs a reduction operation that can be vectorized. 6521 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized 6522 stmt to replace it, put it in VEC_STMT, and insert it at GSI. 6523 Return true if STMT_INFO is vectorizable in this way. 6524 6525 This function also handles reduction idioms (patterns) that have been 6526 recognized in advance during vect_pattern_recog. In this case, STMT_INFO 6527 may be of this form: 6528 X = pattern_expr (arg0, arg1, ..., X) 6529 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original 6530 sequence that had been detected and replaced by the pattern-stmt 6531 (STMT_INFO). 6532 6533 This function also handles reduction of condition expressions, for example: 6534 for (int i = 0; i < N; i++) 6535 if (a[i] < value) 6536 last = a[i]; 6537 This is handled by vectorising the loop and creating an additional vector 6538 containing the loop indexes for which "a[i] < value" was true. In the 6539 function epilogue this is reduced to a single max value and then used to 6540 index into the vector of results. 6541 6542 In some cases of reduction patterns, the type of the reduction variable X is 6543 different than the type of the other arguments of STMT_INFO. 6544 In such cases, the vectype that is used when transforming STMT_INFO into 6545 a vector stmt is different than the vectype that is used to determine the 6546 vectorization factor, because it consists of a different number of elements 6547 than the actual number of elements that are being operated upon in parallel. 6548 6549 For example, consider an accumulation of shorts into an int accumulator. 
6550 On some targets it's possible to vectorize this pattern operating on 8 6551 shorts at a time (hence, the vectype for purposes of determining the 6552 vectorization factor should be V8HI); on the other hand, the vectype that 6553 is used to create the vector form is actually V4SI (the type of the result). 6554 6555 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 6556 indicates what is the actual level of parallelism (V8HI in the example), so 6557 that the right vectorization factor would be derived. This vectype 6558 corresponds to the type of arguments to the reduction stmt, and should *NOT* 6559 be used to create the vectorized stmt. The right vectype for the vectorized 6560 stmt is obtained from the type of the result X: 6561 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) 6562 6563 This means that, contrary to "regular" reductions (or "regular" stmts in 6564 general), the following equation: 6565 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) 6566 does *NOT* necessarily hold for reduction patterns. */ 6567 6568 bool 6569 vectorizable_reduction (loop_vec_info loop_vinfo, 6570 stmt_vec_info stmt_info, slp_tree slp_node, 6571 slp_instance slp_node_instance, 6572 stmt_vector_for_cost *cost_vec) 6573 { 6574 tree vectype_in = NULL_TREE; 6575 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE }; 6576 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6577 enum vect_def_type cond_reduc_dt = vect_unknown_def_type; 6578 stmt_vec_info cond_stmt_vinfo = NULL; 6579 int i; 6580 int ncopies; 6581 bool single_defuse_cycle = false; 6582 bool nested_cycle = false; 6583 bool double_reduc = false; 6584 int vec_num; 6585 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; 6586 tree cond_reduc_val = NULL_TREE; 6587 6588 /* Make sure it was already recognized as a reduction computation. */ 6589 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def 6590 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def 6591 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) 6592 return false; 6593 6594 /* The stmt we store reduction analysis meta on. */ 6595 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); 6596 reduc_info->is_reduc_info = true; 6597 6598 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 6599 { 6600 if (is_a <gphi *> (stmt_info->stmt)) 6601 { 6602 if (slp_node) 6603 { 6604 /* We eventually need to set a vector type on invariant 6605 arguments. */ 6606 unsigned j; 6607 slp_tree child; 6608 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) 6609 if (!vect_maybe_update_slp_op_vectype 6610 (child, SLP_TREE_VECTYPE (slp_node))) 6611 { 6612 if (dump_enabled_p ()) 6613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6614 "incompatible vector types for " 6615 "invariants\n"); 6616 return false; 6617 } 6618 } 6619 /* Analysis for double-reduction is done on the outer 6620 loop PHI, nested cycles have no further restrictions. */ 6621 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; 6622 } 6623 else 6624 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6625 return true; 6626 } 6627 6628 stmt_vec_info orig_stmt_of_analysis = stmt_info; 6629 stmt_vec_info phi_info = stmt_info; 6630 if (!is_a <gphi *> (stmt_info->stmt)) 6631 { 6632 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6633 return true; 6634 } 6635 if (slp_node) 6636 { 6637 slp_node_instance->reduc_phis = slp_node; 6638 /* ??? 
We're leaving slp_node to point to the PHIs, we only 6639 need it to get at the number of vector stmts which wasn't 6640 yet initialized for the instance root. */ 6641 } 6642 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) 6643 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); 6644 else 6645 { 6646 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) 6647 == vect_double_reduction_def); 6648 use_operand_p use_p; 6649 gimple *use_stmt; 6650 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), 6651 &use_p, &use_stmt); 6652 gcc_assert (res); 6653 phi_info = loop_vinfo->lookup_stmt (use_stmt); 6654 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); 6655 } 6656 6657 /* PHIs should not participate in patterns. */ 6658 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); 6659 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt); 6660 6661 /* Verify following REDUC_IDX from the latch def leads us back to the PHI 6662 and compute the reduction chain length. Discover the real 6663 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */ 6664 tree reduc_def 6665 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, 6666 loop_latch_edge 6667 (gimple_bb (reduc_def_phi)->loop_father)); 6668 unsigned reduc_chain_length = 0; 6669 bool only_slp_reduc_chain = true; 6670 stmt_info = NULL; 6671 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL; 6672 while (reduc_def != PHI_RESULT (reduc_def_phi)) 6673 { 6674 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); 6675 stmt_vec_info vdef = vect_stmt_to_vectorize (def); 6676 if (STMT_VINFO_REDUC_IDX (vdef) == -1) 6677 { 6678 if (dump_enabled_p ()) 6679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6680 "reduction chain broken by patterns.\n"); 6681 return false; 6682 } 6683 if (!REDUC_GROUP_FIRST_ELEMENT (vdef)) 6684 only_slp_reduc_chain = false; 6685 /* For epilogue generation live members of the chain need 6686 to point back to the PHI via their original stmt for 6687 info_for_reduction to work. For SLP we need to look at 6688 all lanes here - even though we only will vectorize from 6689 the SLP node with live lane zero the other live lanes also 6690 need to be identified as part of a reduction to be able 6691 to skip code generation for them. */ 6692 if (slp_for_stmt_info) 6693 { 6694 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info)) 6695 if (STMT_VINFO_LIVE_P (s)) 6696 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info; 6697 } 6698 else if (STMT_VINFO_LIVE_P (vdef)) 6699 STMT_VINFO_REDUC_DEF (def) = phi_info; 6700 gimple_match_op op; 6701 if (!gimple_extract_op (vdef->stmt, &op)) 6702 { 6703 if (dump_enabled_p ()) 6704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6705 "reduction chain includes unsupported" 6706 " statement type.\n"); 6707 return false; 6708 } 6709 if (CONVERT_EXPR_CODE_P (op.code)) 6710 { 6711 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) 6712 { 6713 if (dump_enabled_p ()) 6714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6715 "conversion in the reduction chain.\n"); 6716 return false; 6717 } 6718 } 6719 else if (!stmt_info) 6720 /* First non-conversion stmt. */ 6721 stmt_info = vdef; 6722 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; 6723 reduc_chain_length++; 6724 if (!stmt_info && slp_node) 6725 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; 6726 } 6727 /* PHIs should not participate in patterns. 
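The walk above has also discovered the real reduction statement and counted reduc_chain_length; e.g. for a sketch like

      sum_1 = PHI <0 (preheader), sum_3 (latch)>
      sum_2 = sum_1 + a[i];
      sum_3 = sum_2 + b[i];

the walk starts from the latch value sum_3 and follows the REDUC_IDX operands back to the PHI, giving reduc_chain_length == 2.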
*/ 6728 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); 6729 6730 if (nested_in_vect_loop_p (loop, stmt_info)) 6731 { 6732 loop = loop->inner; 6733 nested_cycle = true; 6734 } 6735 6736 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last 6737 element. */ 6738 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6739 { 6740 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info)); 6741 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); 6742 } 6743 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6744 gcc_assert (slp_node 6745 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); 6746 6747 /* 1. Is vectorizable reduction? */ 6748 /* Not supportable if the reduction variable is used in the loop, unless 6749 it's a reduction chain. */ 6750 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer 6751 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6752 return false; 6753 6754 /* Reductions that are not used even in an enclosing outer-loop, 6755 are expected to be "live" (used out of the loop). */ 6756 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope 6757 && !STMT_VINFO_LIVE_P (stmt_info)) 6758 return false; 6759 6760 /* 2. Has this been recognized as a reduction pattern? 6761 6762 Check if STMT represents a pattern that has been recognized 6763 in earlier analysis stages. For stmts that represent a pattern, 6764 the STMT_VINFO_RELATED_STMT field records the last stmt in 6765 the original sequence that constitutes the pattern. */ 6766 6767 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); 6768 if (orig_stmt_info) 6769 { 6770 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 6771 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 6772 } 6773 6774 /* 3. Check the operands of the operation. The first operands are defined 6775 inside the loop body. The last operand is the reduction variable, 6776 which is defined by the loop-header-phi. */ 6777 6778 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6779 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; 6780 gimple_match_op op; 6781 if (!gimple_extract_op (stmt_info->stmt, &op)) 6782 gcc_unreachable (); 6783 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR 6784 || op.code == WIDEN_SUM_EXPR 6785 || op.code == SAD_EXPR); 6786 enum optab_subtype optab_query_kind = optab_vector; 6787 if (op.code == DOT_PROD_EXPR 6788 && (TYPE_SIGN (TREE_TYPE (op.ops[0])) 6789 != TYPE_SIGN (TREE_TYPE (op.ops[1])))) 6790 optab_query_kind = optab_vector_mixed_sign; 6791 6792 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type) 6793 && !SCALAR_FLOAT_TYPE_P (op.type)) 6794 return false; 6795 6796 /* Do not try to vectorize bit-precision reductions. */ 6797 if (!type_has_mode_precision_p (op.type)) 6798 return false; 6799 6800 /* For lane-reducing ops we're reducing the number of reduction PHIs 6801 which means the only use of that may be in the lane-reducing operation. */ 6802 if (lane_reduc_code_p 6803 && reduc_chain_length != 1 6804 && !only_slp_reduc_chain) 6805 { 6806 if (dump_enabled_p ()) 6807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6808 "lane-reducing reduction with extra stmts.\n"); 6809 return false; 6810 } 6811 6812 /* All uses but the last are expected to be defined in the loop. 6813 The last use is the reduction variable. In case of nested cycle this 6814 assumption is not true: we use reduc_index to record the index of the 6815 reduction variable. */ 6816 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops); 6817 /* We need to skip an extra operand for COND_EXPRs with embedded 6818 comparison. 
*/ 6819 unsigned opno_adjust = 0; 6820 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0])) 6821 opno_adjust = 1; 6822 for (i = 0; i < (int) op.num_ops; i++) 6823 { 6824 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ 6825 if (i == 0 && op.code == COND_EXPR) 6826 continue; 6827 6828 stmt_vec_info def_stmt_info; 6829 enum vect_def_type dt; 6830 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info, 6831 i + opno_adjust, &op.ops[i], &slp_op[i], &dt, 6832 &vectype_op[i], &def_stmt_info)) 6833 { 6834 if (dump_enabled_p ()) 6835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6836 "use not simple.\n"); 6837 return false; 6838 } 6839 if (i == STMT_VINFO_REDUC_IDX (stmt_info)) 6840 continue; 6841 6842 /* There should be only one cycle def in the stmt, the one 6843 leading to reduc_def. */ 6844 if (VECTORIZABLE_CYCLE_DEF (dt)) 6845 return false; 6846 6847 if (!vectype_op[i]) 6848 vectype_op[i] 6849 = get_vectype_for_scalar_type (loop_vinfo, 6850 TREE_TYPE (op.ops[i]), slp_op[i]); 6851 6852 /* To properly compute ncopies we are interested in the widest 6853 non-reduction input type in case we're looking at a widening 6854 accumulation that we later handle in vect_transform_reduction. */ 6855 if (lane_reduc_code_p 6856 && vectype_op[i] 6857 && (!vectype_in 6858 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6859 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i])))))) 6860 vectype_in = vectype_op[i]; 6861 6862 if (op.code == COND_EXPR) 6863 { 6864 /* Record how the non-reduction-def value of COND_EXPR is defined. */ 6865 if (dt == vect_constant_def) 6866 { 6867 cond_reduc_dt = dt; 6868 cond_reduc_val = op.ops[i]; 6869 } 6870 if (dt == vect_induction_def 6871 && def_stmt_info 6872 && is_nonwrapping_integer_induction (def_stmt_info, loop)) 6873 { 6874 cond_reduc_dt = dt; 6875 cond_stmt_vinfo = def_stmt_info; 6876 } 6877 } 6878 } 6879 if (!vectype_in) 6880 vectype_in = STMT_VINFO_VECTYPE (phi_info); 6881 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; 6882 6883 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); 6884 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; 6885 /* If we have a condition reduction, see if we can simplify it further. */ 6886 if (v_reduc_type == COND_REDUCTION) 6887 { 6888 if (slp_node) 6889 return false; 6890 6891 /* When the condition uses the reduction value in the condition, fail. 
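For example, a cycle like

      x_1 = PHI <x_init (preheader), x_2 (latch)>
      x_2 = x_1 < a[i] ? a[i] : c[i];

is rejected here: the reduction use sits inside the comparison itself (REDUC_IDX == 0), so each compare depends on the previous iteration's result and the select cannot be rewritten as an index-based condition reduction.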
*/ 6892 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0) 6893 { 6894 if (dump_enabled_p ()) 6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6896 "condition depends on previous iteration\n"); 6897 return false; 6898 } 6899 6900 if (reduc_chain_length == 1 6901 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, 6902 vectype_in, OPTIMIZE_FOR_SPEED)) 6903 { 6904 if (dump_enabled_p ()) 6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6906 "optimizing condition reduction with" 6907 " FOLD_EXTRACT_LAST.\n"); 6908 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION; 6909 } 6910 else if (cond_reduc_dt == vect_induction_def) 6911 { 6912 tree base 6913 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); 6914 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); 6915 6916 gcc_assert (TREE_CODE (base) == INTEGER_CST 6917 && TREE_CODE (step) == INTEGER_CST); 6918 cond_reduc_val = NULL_TREE; 6919 enum tree_code cond_reduc_op_code = ERROR_MARK; 6920 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); 6921 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) 6922 ; 6923 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR 6924 above base; punt if base is the minimum value of the type for 6925 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */ 6926 else if (tree_int_cst_sgn (step) == -1) 6927 { 6928 cond_reduc_op_code = MIN_EXPR; 6929 if (tree_int_cst_sgn (base) == -1) 6930 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6931 else if (tree_int_cst_lt (base, 6932 TYPE_MAX_VALUE (TREE_TYPE (base)))) 6933 cond_reduc_val 6934 = int_const_binop (PLUS_EXPR, base, integer_one_node); 6935 } 6936 else 6937 { 6938 cond_reduc_op_code = MAX_EXPR; 6939 if (tree_int_cst_sgn (base) == 1) 6940 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6941 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), 6942 base)) 6943 cond_reduc_val 6944 = int_const_binop (MINUS_EXPR, base, integer_one_node); 6945 } 6946 if (cond_reduc_val) 6947 { 6948 if (dump_enabled_p ()) 6949 dump_printf_loc (MSG_NOTE, vect_location, 6950 "condition expression based on " 6951 "integer induction.\n"); 6952 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code; 6953 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) 6954 = cond_reduc_val; 6955 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION; 6956 } 6957 } 6958 else if (cond_reduc_dt == vect_constant_def) 6959 { 6960 enum vect_def_type cond_initial_dt; 6961 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi); 6962 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); 6963 if (cond_initial_dt == vect_constant_def 6964 && types_compatible_p (TREE_TYPE (cond_initial_val), 6965 TREE_TYPE (cond_reduc_val))) 6966 { 6967 tree e = fold_binary (LE_EXPR, boolean_type_node, 6968 cond_initial_val, cond_reduc_val); 6969 if (e && (integer_onep (e) || integer_zerop (e))) 6970 { 6971 if (dump_enabled_p ()) 6972 dump_printf_loc (MSG_NOTE, vect_location, 6973 "condition expression based on " 6974 "compile time constant.\n"); 6975 /* Record reduction code at analysis stage. */ 6976 STMT_VINFO_REDUC_CODE (reduc_info) 6977 = integer_onep (e) ? 
MAX_EXPR : MIN_EXPR; 6978 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION; 6979 } 6980 } 6981 } 6982 } 6983 6984 if (STMT_VINFO_LIVE_P (phi_info)) 6985 return false; 6986 6987 if (slp_node) 6988 ncopies = 1; 6989 else 6990 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6991 6992 gcc_assert (ncopies >= 1); 6993 6994 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); 6995 6996 if (nested_cycle) 6997 { 6998 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) 6999 == vect_double_reduction_def); 7000 double_reduc = true; 7001 } 7002 7003 /* 4.2. Check support for the epilog operation. 7004 7005 If STMT represents a reduction pattern, then the type of the 7006 reduction variable may be different than the type of the rest 7007 of the arguments. For example, consider the case of accumulation 7008 of shorts into an int accumulator; The original code: 7009 S1: int_a = (int) short_a; 7010 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; 7011 7012 was replaced with: 7013 STMT: int_acc = widen_sum <short_a, int_acc> 7014 7015 This means that: 7016 1. The tree-code that is used to create the vector operation in the 7017 epilog code (that reduces the partial results) is not the 7018 tree-code of STMT, but is rather the tree-code of the original 7019 stmt from the pattern that STMT is replacing. I.e, in the example 7020 above we want to use 'widen_sum' in the loop, but 'plus' in the 7021 epilog. 7022 2. The type (mode) we use to check available target support 7023 for the vector operation to be created in the *epilog*, is 7024 determined by the type of the reduction variable (in the example 7025 above we'd check this: optab_handler (plus_optab, vect_int_mode])). 7026 However the type (mode) we use to check available target support 7027 for the vector operation to be created *inside the loop*, is 7028 determined by the type of the other arguments to STMT (in the 7029 example we'd check this: optab_handler (widen_sum_optab, 7030 vect_short_mode)). 7031 7032 This is contrary to "regular" reductions, in which the types of all 7033 the arguments are the same as the type of the reduction variable. 7034 For "regular" reductions we can therefore use the same vector type 7035 (and also the same tree-code) when generating the epilog code and 7036 when generating the code inside the loop. */ 7037 7038 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info); 7039 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; 7040 7041 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); 7042 if (reduction_type == TREE_CODE_REDUCTION) 7043 { 7044 /* Check whether it's ok to change the order of the computation. 7045 Generally, when vectorizing a reduction we change the order of the 7046 computation. This may change the behavior of the program in some 7047 cases, so we need to check that this is ok. One exception is when 7048 vectorizing an outer-loop: the inner-loop is executed sequentially, 7049 and therefore vectorizing reductions in the inner-loop during 7050 outer-loop vectorization is safe. Likewise when we are vectorizing 7051 a series of reductions using SLP and the VF is one the reductions 7052 are performed in scalar order. */ 7053 if (slp_node 7054 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) 7055 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u)) 7056 ; 7057 else if (needs_fold_left_reduction_p (op.type, orig_code)) 7058 { 7059 /* When vectorizing a reduction chain w/o SLP the reduction PHI 7060 is not directy used in stmt. 
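needs_fold_left_reduction_p determined above that reassociating the scalar operation is not OK; e.g. a float  s += a[i]  compiled without -fassociative-math has to be accumulated strictly in scalar order,  s = (((s + a[0]) + a[1]) + a[2]) + ...,  so below the reduction is switched to FOLD_LEFT_REDUCTION instead of being computed as a tree of partial sums.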
*/ 7061 if (!only_slp_reduc_chain 7062 && reduc_chain_length != 1) 7063 { 7064 if (dump_enabled_p ()) 7065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7066 "in-order reduction chain without SLP.\n"); 7067 return false; 7068 } 7069 STMT_VINFO_REDUC_TYPE (reduc_info) 7070 = reduction_type = FOLD_LEFT_REDUCTION; 7071 } 7072 else if (!commutative_binary_op_p (orig_code, op.type) 7073 || !associative_binary_op_p (orig_code, op.type)) 7074 { 7075 if (dump_enabled_p ()) 7076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7077 "reduction: not commutative/associative"); 7078 return false; 7079 } 7080 } 7081 7082 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) 7083 && ncopies > 1) 7084 { 7085 if (dump_enabled_p ()) 7086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7087 "multiple types in double reduction or condition " 7088 "reduction or fold-left reduction.\n"); 7089 return false; 7090 } 7091 7092 internal_fn reduc_fn = IFN_LAST; 7093 if (reduction_type == TREE_CODE_REDUCTION 7094 || reduction_type == FOLD_LEFT_REDUCTION 7095 || reduction_type == INTEGER_INDUC_COND_REDUCTION 7096 || reduction_type == CONST_COND_REDUCTION) 7097 { 7098 if (reduction_type == FOLD_LEFT_REDUCTION 7099 ? fold_left_reduction_fn (orig_code, &reduc_fn) 7100 : reduction_fn_for_scalar_code (orig_code, &reduc_fn)) 7101 { 7102 if (reduc_fn != IFN_LAST 7103 && !direct_internal_fn_supported_p (reduc_fn, vectype_out, 7104 OPTIMIZE_FOR_SPEED)) 7105 { 7106 if (dump_enabled_p ()) 7107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7108 "reduc op not supported by target.\n"); 7109 7110 reduc_fn = IFN_LAST; 7111 } 7112 } 7113 else 7114 { 7115 if (!nested_cycle || double_reduc) 7116 { 7117 if (dump_enabled_p ()) 7118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7119 "no reduc code for scalar code.\n"); 7120 7121 return false; 7122 } 7123 } 7124 } 7125 else if (reduction_type == COND_REDUCTION) 7126 { 7127 int scalar_precision 7128 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type)); 7129 cr_index_scalar_type = make_unsigned_type (scalar_precision); 7130 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type, 7131 vectype_out); 7132 7133 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, 7134 OPTIMIZE_FOR_SPEED)) 7135 reduc_fn = IFN_REDUC_MAX; 7136 } 7137 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn; 7138 7139 if (reduction_type != EXTRACT_LAST_REDUCTION 7140 && (!nested_cycle || double_reduc) 7141 && reduc_fn == IFN_LAST 7142 && !nunits_out.is_constant ()) 7143 { 7144 if (dump_enabled_p ()) 7145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7146 "missing target support for reduction on" 7147 " variable-length vectors.\n"); 7148 return false; 7149 } 7150 7151 /* For SLP reductions, see if there is a neutral value we can use. 
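The neutral element is the value that leaves the reduction result unchanged: e.g. 0 for PLUS_EXPR, BIT_IOR_EXPR and BIT_XOR_EXPR, 1 for MULT_EXPR, an all-ones constant for BIT_AND_EXPR, and for MIN_EXPR/MAX_EXPR the initial value itself (see neutral_op_for_reduction).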
*/ 7152 tree neutral_op = NULL_TREE; 7153 if (slp_node) 7154 { 7155 tree initial_value = NULL_TREE; 7156 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL) 7157 initial_value = vect_phi_initial_value (reduc_def_phi); 7158 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out), 7159 orig_code, initial_value); 7160 } 7161 7162 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) 7163 { 7164 /* We can't support in-order reductions of code such as this: 7165 7166 for (int i = 0; i < n1; ++i) 7167 for (int j = 0; j < n2; ++j) 7168 l += a[j]; 7169 7170 since GCC effectively transforms the loop when vectorizing: 7171 7172 for (int i = 0; i < n1 / VF; ++i) 7173 for (int j = 0; j < n2; ++j) 7174 for (int k = 0; k < VF; ++k) 7175 l += a[j]; 7176 7177 which is a reassociation of the original operation. */ 7178 if (dump_enabled_p ()) 7179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7180 "in-order double reduction not supported.\n"); 7181 7182 return false; 7183 } 7184 7185 if (reduction_type == FOLD_LEFT_REDUCTION 7186 && slp_node 7187 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 7188 { 7189 /* We cannot use in-order reductions in this case because there is 7190 an implicit reassociation of the operations involved. */ 7191 if (dump_enabled_p ()) 7192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7193 "in-order unchained SLP reductions not supported.\n"); 7194 return false; 7195 } 7196 7197 /* For double reductions, and for SLP reductions with a neutral value, 7198 we construct a variable-length initial vector by loading a vector 7199 full of the neutral value and then shift-and-inserting the start 7200 values into the low-numbered elements. */ 7201 if ((double_reduc || neutral_op) 7202 && !nunits_out.is_constant () 7203 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, 7204 vectype_out, OPTIMIZE_FOR_SPEED)) 7205 { 7206 if (dump_enabled_p ()) 7207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7208 "reduction on variable-length vectors requires" 7209 " target support for a vector-shift-and-insert" 7210 " operation.\n"); 7211 return false; 7212 } 7213 7214 /* Check extra constraints for variable-length unchained SLP reductions. */ 7215 if (slp_node 7216 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) 7217 && !nunits_out.is_constant ()) 7218 { 7219 /* We checked above that we could build the initial vector when 7220 there's a neutral element value. Check here for the case in 7221 which each SLP statement has its own initial value and in which 7222 that value needs to be repeated for every instance of the 7223 statement within the initial vector. */ 7224 unsigned int group_size = SLP_TREE_LANES (slp_node); 7225 if (!neutral_op 7226 && !can_duplicate_and_interleave_p (loop_vinfo, group_size, 7227 TREE_TYPE (vectype_out))) 7228 { 7229 if (dump_enabled_p ()) 7230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7231 "unsupported form of SLP reduction for" 7232 " variable-length vectors: cannot build" 7233 " initial vector.\n"); 7234 return false; 7235 } 7236 /* The epilogue code relies on the number of elements being a multiple 7237 of the group size. The duplicate-and-interleave approach to setting 7238 up the initial vector does too. 
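For example, with group_size == 3 and a VNx4SI accumulator the 4 * N lanes cannot be divided evenly between the three scalar results for every N, so that combination is rejected below, whereas group sizes of 2 or 4 would be fine.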
*/ 7239 if (!multiple_p (nunits_out, group_size)) 7240 { 7241 if (dump_enabled_p ()) 7242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7243 "unsupported form of SLP reduction for" 7244 " variable-length vectors: the vector size" 7245 " is not a multiple of the number of results.\n"); 7246 return false; 7247 } 7248 } 7249 7250 if (reduction_type == COND_REDUCTION) 7251 { 7252 widest_int ni; 7253 7254 if (! max_loop_iterations (loop, &ni)) 7255 { 7256 if (dump_enabled_p ()) 7257 dump_printf_loc (MSG_NOTE, vect_location, 7258 "loop count not known, cannot create cond " 7259 "reduction.\n"); 7260 return false; 7261 } 7262 /* Convert backedges to iterations. */ 7263 ni += 1; 7264 7265 /* The additional index will be the same type as the condition. Check 7266 that the loop can fit into this less one (because we'll use up the 7267 zero slot for when there are no matches). */ 7268 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); 7269 if (wi::geu_p (ni, wi::to_widest (max_index))) 7270 { 7271 if (dump_enabled_p ()) 7272 dump_printf_loc (MSG_NOTE, vect_location, 7273 "loop size is greater than data size.\n"); 7274 return false; 7275 } 7276 } 7277 7278 /* In case the vectorization factor (VF) is bigger than the number 7279 of elements that we can fit in a vectype (nunits), we have to generate 7280 more than one vector stmt - i.e - we need to "unroll" the 7281 vector stmt by a factor VF/nunits. For more details see documentation 7282 in vectorizable_operation. */ 7283 7284 /* If the reduction is used in an outer loop we need to generate 7285 VF intermediate results, like so (e.g. for ncopies=2): 7286 r0 = phi (init, r0) 7287 r1 = phi (init, r1) 7288 r0 = x0 + r0; 7289 r1 = x1 + r1; 7290 (i.e. we generate VF results in 2 registers). 7291 In this case we have a separate def-use cycle for each copy, and therefore 7292 for each copy we get the vector def for the reduction variable from the 7293 respective phi node created for this copy. 7294 7295 Otherwise (the reduction is unused in the loop nest), we can combine 7296 together intermediate results, like so (e.g. for ncopies=2): 7297 r = phi (init, r) 7298 r = x0 + r; 7299 r = x1 + r; 7300 (i.e. we generate VF/2 results in a single register). 7301 In this case for each copy we get the vector def for the reduction variable 7302 from the vectorized reduction operation generated in the previous iteration. 7303 7304 This only works when we see both the reduction PHI and its only consumer 7305 in vectorizable_reduction and there are no intermediate stmts 7306 participating. When unrolling we want each unrolled iteration to have its 7307 own reduction accumulator since one of the main goals of unrolling a 7308 reduction is to reduce the aggregate loop-carried latency. */ 7309 if (ncopies > 1 7310 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) 7311 && reduc_chain_length == 1 7312 && loop_vinfo->suggested_unroll_factor == 1) 7313 single_defuse_cycle = true; 7314 7315 if (single_defuse_cycle || lane_reduc_code_p) 7316 { 7317 gcc_assert (op.code != COND_EXPR); 7318 7319 /* 4. Supportable by target? */ 7320 bool ok = true; 7321 7322 /* 4.1. 
check support for the operation in the loop */ 7323 machine_mode vec_mode = TYPE_MODE (vectype_in); 7324 if (!directly_supported_p (op.code, vectype_in, optab_query_kind)) 7325 { 7326 if (dump_enabled_p ()) 7327 dump_printf (MSG_NOTE, "op not supported by target.\n"); 7328 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) 7329 || !vect_can_vectorize_without_simd_p (op.code)) 7330 ok = false; 7331 else 7332 if (dump_enabled_p ()) 7333 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); 7334 } 7335 7336 if (vect_emulated_vector_p (vectype_in) 7337 && !vect_can_vectorize_without_simd_p (op.code)) 7338 { 7339 if (dump_enabled_p ()) 7340 dump_printf (MSG_NOTE, "using word mode not possible.\n"); 7341 return false; 7342 } 7343 7344 /* lane-reducing operations have to go through vect_transform_reduction. 7345 For the other cases try without the single cycle optimization. */ 7346 if (!ok) 7347 { 7348 if (lane_reduc_code_p) 7349 return false; 7350 else 7351 single_defuse_cycle = false; 7352 } 7353 } 7354 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; 7355 7356 /* If the reduction stmt is one of the patterns that have lane 7357 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ 7358 if ((ncopies > 1 && ! single_defuse_cycle) 7359 && lane_reduc_code_p) 7360 { 7361 if (dump_enabled_p ()) 7362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7363 "multi def-use cycle not possible for lane-reducing " 7364 "reduction operation\n"); 7365 return false; 7366 } 7367 7368 if (slp_node 7369 && !(!single_defuse_cycle 7370 && !lane_reduc_code_p 7371 && reduction_type != FOLD_LEFT_REDUCTION)) 7372 for (i = 0; i < (int) op.num_ops; i++) 7373 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) 7374 { 7375 if (dump_enabled_p ()) 7376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7377 "incompatible vector types for invariants\n"); 7378 return false; 7379 } 7380 7381 if (slp_node) 7382 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7383 else 7384 vec_num = 1; 7385 7386 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, 7387 reduction_type, ncopies, cost_vec); 7388 /* Cost the reduction op inside the loop if transformed via 7389 vect_transform_reduction. Otherwise this is costed by the 7390 separate vectorizable_* routines. */ 7391 if (single_defuse_cycle || lane_reduc_code_p) 7392 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); 7393 7394 if (dump_enabled_p () 7395 && reduction_type == FOLD_LEFT_REDUCTION) 7396 dump_printf_loc (MSG_NOTE, vect_location, 7397 "using an in-order (fold-left) reduction.\n"); 7398 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; 7399 /* All but single defuse-cycle optimized, lane-reducing and fold-left 7400 reductions go through their own vectorizable_* routines. 
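Lane-reducing operations are those whose vector form folds several input lanes into each output lane, e.g. a DOT_PROD_EXPR accumulating a V8HI by V8HI product into a V4SI accumulator; like the forced single-def-use cycle they are code generated by vect_transform_reduction.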
*/ 7401 if (!single_defuse_cycle 7402 && !lane_reduc_code_p 7403 && reduction_type != FOLD_LEFT_REDUCTION) 7404 { 7405 stmt_vec_info tem 7406 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); 7407 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem)) 7408 { 7409 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem)); 7410 tem = REDUC_GROUP_FIRST_ELEMENT (tem); 7411 } 7412 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; 7413 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; 7414 } 7415 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) 7416 { 7417 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 7418 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type); 7419 7420 if (reduction_type != FOLD_LEFT_REDUCTION 7421 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in) 7422 && (cond_fn == IFN_LAST 7423 || !direct_internal_fn_supported_p (cond_fn, vectype_in, 7424 OPTIMIZE_FOR_SPEED))) 7425 { 7426 if (dump_enabled_p ()) 7427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7428 "can't operate on partial vectors because" 7429 " no conditional operation is available.\n"); 7430 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; 7431 } 7432 else if (reduction_type == FOLD_LEFT_REDUCTION 7433 && reduc_fn == IFN_LAST 7434 && !expand_vec_cond_expr_p (vectype_in, 7435 truth_type_for (vectype_in), 7436 SSA_NAME)) 7437 { 7438 if (dump_enabled_p ()) 7439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7440 "can't operate on partial vectors because" 7441 " no conditional operation is available.\n"); 7442 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; 7443 } 7444 else 7445 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, 7446 vectype_in, NULL); 7447 } 7448 return true; 7449 } 7450 7451 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge 7452 value. */ 7453 7454 bool 7455 vect_transform_reduction (loop_vec_info loop_vinfo, 7456 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, 7457 gimple **vec_stmt, slp_tree slp_node) 7458 { 7459 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 7460 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7461 int i; 7462 int ncopies; 7463 int vec_num; 7464 7465 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); 7466 gcc_assert (reduc_info->is_reduc_info); 7467 7468 if (nested_in_vect_loop_p (loop, stmt_info)) 7469 { 7470 loop = loop->inner; 7471 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def); 7472 } 7473 7474 gimple_match_op op; 7475 if (!gimple_extract_op (stmt_info->stmt, &op)) 7476 gcc_unreachable (); 7477 7478 /* All uses but the last are expected to be defined in the loop. 7479 The last use is the reduction variable. In case of nested cycle this 7480 assumption is not true: we use reduc_index to record the index of the 7481 reduction variable. 
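E.g. for  sum_2 = DOT_PROD_EXPR <a_3, b_4, sum_1>  the accumulator is the last operand, so reduc_index == 2 identifies the operand that is carried around the loop by the reduction PHI.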
*/ 7482 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); 7483 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt); 7484 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); 7485 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); 7486 7487 if (slp_node) 7488 { 7489 ncopies = 1; 7490 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7491 } 7492 else 7493 { 7494 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 7495 vec_num = 1; 7496 } 7497 7498 code_helper code = canonicalize_code (op.code, op.type); 7499 internal_fn cond_fn = get_conditional_internal_fn (code, op.type); 7500 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 7501 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); 7502 7503 /* Transform. */ 7504 tree new_temp = NULL_TREE; 7505 auto_vec<tree> vec_oprnds0; 7506 auto_vec<tree> vec_oprnds1; 7507 auto_vec<tree> vec_oprnds2; 7508 tree def0; 7509 7510 if (dump_enabled_p ()) 7511 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); 7512 7513 /* FORNOW: Multiple types are not supported for condition. */ 7514 if (code == COND_EXPR) 7515 gcc_assert (ncopies == 1); 7516 7517 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 7518 7519 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); 7520 if (reduction_type == FOLD_LEFT_REDUCTION) 7521 { 7522 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); 7523 gcc_assert (code.is_tree_code ()); 7524 return vectorize_fold_left_reduction 7525 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, 7526 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks); 7527 } 7528 7529 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); 7530 gcc_assert (single_defuse_cycle 7531 || code == DOT_PROD_EXPR 7532 || code == WIDEN_SUM_EXPR 7533 || code == SAD_EXPR); 7534 7535 /* Create the destination vector */ 7536 tree scalar_dest = gimple_get_lhs (stmt_info->stmt); 7537 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 7538 7539 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, 7540 single_defuse_cycle && reduc_index == 0 7541 ? NULL_TREE : op.ops[0], &vec_oprnds0, 7542 single_defuse_cycle && reduc_index == 1 7543 ? NULL_TREE : op.ops[1], &vec_oprnds1, 7544 op.num_ops == 3 7545 && !(single_defuse_cycle && reduc_index == 2) 7546 ? op.ops[2] : NULL_TREE, &vec_oprnds2); 7547 if (single_defuse_cycle) 7548 { 7549 gcc_assert (!slp_node); 7550 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1, 7551 op.ops[reduc_index], 7552 reduc_index == 0 ? &vec_oprnds0 7553 : (reduc_index == 1 ? &vec_oprnds1 7554 : &vec_oprnds2)); 7555 } 7556 7557 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 7558 { 7559 gimple *new_stmt; 7560 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; 7561 if (masked_loop_p && !mask_by_cond_expr) 7562 { 7563 /* Make sure that the reduction accumulator is vop[0]. 
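The conditional internal function takes the else value as its last argument, so the call built below looks roughly like  new_temp = .COND_ADD (loop_mask, accumulator, vx, accumulator)  for a PLUS_EXPR reduction (vx standing for the current vector input); masked-out lanes keep the incoming accumulator value, which is exactly what a partially vectorized reduction needs.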
*/ 7564 if (reduc_index == 1) 7565 { 7566 gcc_assert (commutative_binary_op_p (code, op.type)); 7567 std::swap (vop[0], vop[1]); 7568 } 7569 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, 7570 vectype_in, i); 7571 gcall *call = gimple_build_call_internal (cond_fn, 4, mask, 7572 vop[0], vop[1], vop[0]); 7573 new_temp = make_ssa_name (vec_dest, call); 7574 gimple_call_set_lhs (call, new_temp); 7575 gimple_call_set_nothrow (call, true); 7576 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi); 7577 new_stmt = call; 7578 } 7579 else 7580 { 7581 if (op.num_ops == 3) 7582 vop[2] = vec_oprnds2[i]; 7583 7584 if (masked_loop_p && mask_by_cond_expr) 7585 { 7586 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, 7587 vectype_in, i); 7588 build_vect_cond_expr (code, vop, mask, gsi); 7589 } 7590 7591 if (code.is_internal_fn ()) 7592 new_stmt = gimple_build_call_internal (internal_fn (code), 7593 op.num_ops, 7594 vop[0], vop[1], vop[2]); 7595 else 7596 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code), 7597 vop[0], vop[1], vop[2]); 7598 new_temp = make_ssa_name (vec_dest, new_stmt); 7599 gimple_set_lhs (new_stmt, new_temp); 7600 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); 7601 } 7602 7603 if (slp_node) 7604 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); 7605 else if (single_defuse_cycle 7606 && i < ncopies - 1) 7607 { 7608 if (reduc_index == 0) 7609 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt)); 7610 else if (reduc_index == 1) 7611 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt)); 7612 else if (reduc_index == 2) 7613 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt)); 7614 } 7615 else 7616 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); 7617 } 7618 7619 if (!slp_node) 7620 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; 7621 7622 return true; 7623 } 7624 7625 /* Transform phase of a cycle PHI. */ 7626 7627 bool 7628 vect_transform_cycle_phi (loop_vec_info loop_vinfo, 7629 stmt_vec_info stmt_info, gimple **vec_stmt, 7630 slp_tree slp_node, slp_instance slp_node_instance) 7631 { 7632 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 7633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7634 int i; 7635 int ncopies; 7636 int j; 7637 bool nested_cycle = false; 7638 int vec_num; 7639 7640 if (nested_in_vect_loop_p (loop, stmt_info)) 7641 { 7642 loop = loop->inner; 7643 nested_cycle = true; 7644 } 7645 7646 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); 7647 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); 7648 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); 7649 gcc_assert (reduc_info->is_reduc_info); 7650 7651 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION 7652 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION) 7653 /* Leave the scalar phi in place. */ 7654 return true; 7655 7656 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); 7657 /* For a nested cycle we do not fill the above. */ 7658 if (!vectype_in) 7659 vectype_in = STMT_VINFO_VECTYPE (stmt_info); 7660 gcc_assert (vectype_in); 7661 7662 if (slp_node) 7663 { 7664 /* The size vect_schedule_slp_instance computes is off for us. 
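Instead compute it from the vectorization factor and the number of SLP lanes; e.g. with VF == 4, two reduction lanes and a V4SI vectype_in this gives (4 * 2) / 4 == 2 vector PHIs.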
*/ 7665 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 7666 * SLP_TREE_LANES (slp_node), vectype_in); 7667 ncopies = 1; 7668 } 7669 else 7670 { 7671 vec_num = 1; 7672 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 7673 } 7674 7675 /* Check whether we should use a single PHI node and accumulate 7676 vectors to one before the backedge. */ 7677 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info)) 7678 ncopies = 1; 7679 7680 /* Create the destination vector */ 7681 gphi *phi = as_a <gphi *> (stmt_info->stmt); 7682 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi), 7683 vectype_out); 7684 7685 /* Get the loop-entry arguments. */ 7686 tree vec_initial_def = NULL_TREE; 7687 auto_vec<tree> vec_initial_defs; 7688 if (slp_node) 7689 { 7690 vec_initial_defs.reserve (vec_num); 7691 if (nested_cycle) 7692 { 7693 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx; 7694 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx], 7695 &vec_initial_defs); 7696 } 7697 else 7698 { 7699 gcc_assert (slp_node == slp_node_instance->reduc_phis); 7700 vec<tree> &initial_values = reduc_info->reduc_initial_values; 7701 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node); 7702 7703 unsigned int num_phis = stmts.length (); 7704 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) 7705 num_phis = 1; 7706 initial_values.reserve (num_phis); 7707 for (unsigned int i = 0; i < num_phis; ++i) 7708 { 7709 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt); 7710 initial_values.quick_push (vect_phi_initial_value (this_phi)); 7711 } 7712 if (vec_num == 1) 7713 vect_find_reusable_accumulator (loop_vinfo, reduc_info); 7714 if (!initial_values.is_empty ()) 7715 { 7716 tree initial_value 7717 = (num_phis == 1 ? initial_values[0] : NULL_TREE); 7718 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); 7719 tree neutral_op 7720 = neutral_op_for_reduction (TREE_TYPE (vectype_out), 7721 code, initial_value); 7722 get_initial_defs_for_reduction (loop_vinfo, reduc_info, 7723 &vec_initial_defs, vec_num, 7724 stmts.length (), neutral_op); 7725 } 7726 } 7727 } 7728 else 7729 { 7730 /* Get at the scalar def before the loop, that defines the initial 7731 value of the reduction variable. */ 7732 tree initial_def = vect_phi_initial_value (phi); 7733 reduc_info->reduc_initial_values.safe_push (initial_def); 7734 /* Optimize: if initial_def is for REDUC_MAX smaller than the base 7735 and we can't use zero for induc_val, use initial_def. Similarly 7736 for REDUC_MIN and initial_def larger than the base. */ 7737 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) 7738 { 7739 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); 7740 if (TREE_CODE (initial_def) == INTEGER_CST 7741 && !integer_zerop (induc_val) 7742 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR 7743 && tree_int_cst_lt (initial_def, induc_val)) 7744 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR 7745 && tree_int_cst_lt (induc_val, initial_def)))) 7746 { 7747 induc_val = initial_def; 7748 /* Communicate we used the initial_def to epilouge 7749 generation. */ 7750 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; 7751 } 7752 vec_initial_def = build_vector_from_val (vectype_out, induc_val); 7753 } 7754 else if (nested_cycle) 7755 { 7756 /* Do not use an adjustment def as that case is not supported 7757 correctly if ncopies is not one. 
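An adjustment def would start the vector accumulator at the neutral value and fold the original initial value back in during the epilogue; e.g.  sum = 10; for (...) sum += a[i];  would use a { 0, 0, 0, 0 } starting vector with + 10 applied to the final scalar result (see STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT below).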
*/ 7758 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info, 7759 ncopies, initial_def, 7760 &vec_initial_defs); 7761 } 7762 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION 7763 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) 7764 /* Fill the initial vector with the initial scalar value. */ 7765 vec_initial_def 7766 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, 7767 initial_def, initial_def); 7768 else 7769 { 7770 if (ncopies == 1) 7771 vect_find_reusable_accumulator (loop_vinfo, reduc_info); 7772 if (!reduc_info->reduc_initial_values.is_empty ()) 7773 { 7774 initial_def = reduc_info->reduc_initial_values[0]; 7775 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); 7776 tree neutral_op 7777 = neutral_op_for_reduction (TREE_TYPE (initial_def), 7778 code, initial_def); 7779 gcc_assert (neutral_op); 7780 /* Try to simplify the vector initialization by applying an 7781 adjustment after the reduction has been performed. */ 7782 if (!reduc_info->reused_accumulator 7783 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 7784 && !operand_equal_p (neutral_op, initial_def)) 7785 { 7786 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) 7787 = initial_def; 7788 initial_def = neutral_op; 7789 } 7790 vec_initial_def 7791 = get_initial_def_for_reduction (loop_vinfo, reduc_info, 7792 initial_def, neutral_op); 7793 } 7794 } 7795 } 7796 7797 if (vec_initial_def) 7798 { 7799 vec_initial_defs.create (ncopies); 7800 for (i = 0; i < ncopies; ++i) 7801 vec_initial_defs.quick_push (vec_initial_def); 7802 } 7803 7804 if (auto *accumulator = reduc_info->reused_accumulator) 7805 { 7806 tree def = accumulator->reduc_input; 7807 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) 7808 { 7809 unsigned int nreduc; 7810 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS 7811 (TREE_TYPE (def)), 7812 TYPE_VECTOR_SUBPARTS (vectype_out), 7813 &nreduc); 7814 gcc_assert (res); 7815 gimple_seq stmts = NULL; 7816 /* Reduce the single vector to a smaller one. */ 7817 if (nreduc != 1) 7818 { 7819 /* Perform the reduction in the appropriate type. */ 7820 tree rvectype = vectype_out; 7821 if (!useless_type_conversion_p (TREE_TYPE (vectype_out), 7822 TREE_TYPE (TREE_TYPE (def)))) 7823 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)), 7824 TYPE_VECTOR_SUBPARTS 7825 (vectype_out)); 7826 def = vect_create_partial_epilog (def, rvectype, 7827 STMT_VINFO_REDUC_CODE 7828 (reduc_info), 7829 &stmts); 7830 } 7831 /* The epilogue loop might use a different vector mode, like 7832 VNx2DI vs. V2DI. */ 7833 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def))) 7834 { 7835 tree reduc_type = build_vector_type_for_mode 7836 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out)); 7837 def = gimple_convert (&stmts, reduc_type, def); 7838 } 7839 /* Adjust the input so we pick up the partially reduced value 7840 for the skip edge in vect_create_epilog_for_reduction. */ 7841 accumulator->reduc_input = def; 7842 /* And the reduction could be carried out using a different sign. */ 7843 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) 7844 def = gimple_convert (&stmts, vectype_out, def); 7845 if (loop_vinfo->main_loop_edge) 7846 { 7847 /* While we'd like to insert on the edge this will split 7848 blocks and disturb bookkeeping, we also will eventually 7849 need this on the skip edge. Rely on sinking to 7850 fixup optimal placement and insert in the pred. 
*/ 7851 gimple_stmt_iterator gsi 7852 = gsi_last_bb (loop_vinfo->main_loop_edge->src); 7853 /* Insert before a cond that eventually skips the 7854 epilogue. */ 7855 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi))) 7856 gsi_prev (&gsi); 7857 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING); 7858 } 7859 else 7860 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), 7861 stmts); 7862 } 7863 if (loop_vinfo->main_loop_edge) 7864 vec_initial_defs[0] 7865 = vect_get_main_loop_result (loop_vinfo, def, 7866 vec_initial_defs[0]); 7867 else 7868 vec_initial_defs.safe_push (def); 7869 } 7870 7871 /* Generate the reduction PHIs upfront. */ 7872 for (i = 0; i < vec_num; i++) 7873 { 7874 tree vec_init_def = vec_initial_defs[i]; 7875 for (j = 0; j < ncopies; j++) 7876 { 7877 /* Create the reduction-phi that defines the reduction 7878 operand. */ 7879 gphi *new_phi = create_phi_node (vec_dest, loop->header); 7880 7881 /* Set the loop-entry arg of the reduction-phi. */ 7882 if (j != 0 && nested_cycle) 7883 vec_init_def = vec_initial_defs[j]; 7884 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop), 7885 UNKNOWN_LOCATION); 7886 7887 /* The loop-latch arg is set in epilogue processing. */ 7888 7889 if (slp_node) 7890 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi); 7891 else 7892 { 7893 if (j == 0) 7894 *vec_stmt = new_phi; 7895 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi); 7896 } 7897 } 7898 } 7899 7900 return true; 7901 } 7902 7903 /* Vectorizes LC PHIs. */ 7904 7905 bool 7906 vectorizable_lc_phi (loop_vec_info loop_vinfo, 7907 stmt_vec_info stmt_info, gimple **vec_stmt, 7908 slp_tree slp_node) 7909 { 7910 if (!loop_vinfo 7911 || !is_a <gphi *> (stmt_info->stmt) 7912 || gimple_phi_num_args (stmt_info->stmt) != 1) 7913 return false; 7914 7915 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def 7916 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) 7917 return false; 7918 7919 if (!vec_stmt) /* transformation not required. */ 7920 { 7921 /* Deal with copies from externs or constants that disguise as 7922 loop-closed PHI nodes (PR97886). */ 7923 if (slp_node 7924 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0], 7925 SLP_TREE_VECTYPE (slp_node))) 7926 { 7927 if (dump_enabled_p ()) 7928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7929 "incompatible vector types for invariants\n"); 7930 return false; 7931 } 7932 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type; 7933 return true; 7934 } 7935 7936 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7937 tree scalar_dest = gimple_phi_result (stmt_info->stmt); 7938 basic_block bb = gimple_bb (stmt_info->stmt); 7939 edge e = single_pred_edge (bb); 7940 tree vec_dest = vect_create_destination_var (scalar_dest, vectype); 7941 auto_vec<tree> vec_oprnds; 7942 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 7943 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1, 7944 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds); 7945 for (unsigned i = 0; i < vec_oprnds.length (); i++) 7946 { 7947 /* Create the vectorized LC PHI node. */ 7948 gphi *new_phi = create_phi_node (vec_dest, bb); 7949 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION); 7950 if (slp_node) 7951 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi); 7952 else 7953 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi); 7954 } 7955 if (!slp_node) 7956 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; 7957 7958 return true; 7959 } 7960 7961 /* Vectorizes PHIs. 
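These are plain vect_internal_def merge PHIs coming from control flow, e.g.  x_3 = PHI <x_1 (then_bb), x_2 (else_bb)>,  and are handled only under SLP, where every argument must have (or later receive) a vectorized definition.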
*/ 7962 7963 bool 7964 vectorizable_phi (vec_info *, 7965 stmt_vec_info stmt_info, gimple **vec_stmt, 7966 slp_tree slp_node, stmt_vector_for_cost *cost_vec) 7967 { 7968 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node) 7969 return false; 7970 7971 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def) 7972 return false; 7973 7974 tree vectype = SLP_TREE_VECTYPE (slp_node); 7975 7976 if (!vec_stmt) /* transformation not required. */ 7977 { 7978 slp_tree child; 7979 unsigned i; 7980 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child) 7981 if (!child) 7982 { 7983 if (dump_enabled_p ()) 7984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7985 "PHI node with unvectorized backedge def\n"); 7986 return false; 7987 } 7988 else if (!vect_maybe_update_slp_op_vectype (child, vectype)) 7989 { 7990 if (dump_enabled_p ()) 7991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7992 "incompatible vector types for invariants\n"); 7993 return false; 7994 } 7995 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def 7996 && !useless_type_conversion_p (vectype, 7997 SLP_TREE_VECTYPE (child))) 7998 { 7999 /* With bools we can have mask and non-mask precision vectors 8000 or different non-mask precisions. while pattern recog is 8001 supposed to guarantee consistency here bugs in it can cause 8002 mismatches (PR103489 and PR103800 for example). 8003 Deal with them here instead of ICEing later. */ 8004 if (dump_enabled_p ()) 8005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8006 "incompatible vector type setup from " 8007 "bool pattern detection\n"); 8008 return false; 8009 } 8010 8011 /* For single-argument PHIs assume coalescing which means zero cost 8012 for the scalar and the vector PHIs. This avoids artificially 8013 favoring the vector path (but may pessimize it in some cases). */ 8014 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1) 8015 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), 8016 vector_stmt, stmt_info, vectype, 0, vect_body); 8017 STMT_VINFO_TYPE (stmt_info) = phi_info_type; 8018 return true; 8019 } 8020 8021 tree scalar_dest = gimple_phi_result (stmt_info->stmt); 8022 basic_block bb = gimple_bb (stmt_info->stmt); 8023 tree vec_dest = vect_create_destination_var (scalar_dest, vectype); 8024 auto_vec<gphi *> new_phis; 8025 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i) 8026 { 8027 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; 8028 8029 /* Skip not yet vectorized defs. */ 8030 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def 8031 && SLP_TREE_VEC_STMTS (child).is_empty ()) 8032 continue; 8033 8034 auto_vec<tree> vec_oprnds; 8035 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds); 8036 if (!new_phis.exists ()) 8037 { 8038 new_phis.create (vec_oprnds.length ()); 8039 for (unsigned j = 0; j < vec_oprnds.length (); j++) 8040 { 8041 /* Create the vectorized LC PHI node. */ 8042 new_phis.quick_push (create_phi_node (vec_dest, bb)); 8043 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]); 8044 } 8045 } 8046 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i); 8047 for (unsigned j = 0; j < vec_oprnds.length (); j++) 8048 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION); 8049 } 8050 /* We should have at least one already vectorized child. */ 8051 gcc_assert (new_phis.exists ()); 8052 8053 return true; 8054 } 8055 8056 /* Return true if VECTYPE represents a vector that requires lowering 8057 by the vector lowering pass. 
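Such a vector is emulated in an integer mode rather than a true vector mode, e.g. a 4 x char vector carried in an SImode word on a target without SIMD support; single-bit boolean vectors are excluded because they are already plain integer mask values.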
*/ 8058 8059 bool 8060 vect_emulated_vector_p (tree vectype) 8061 { 8062 return (!VECTOR_MODE_P (TYPE_MODE (vectype)) 8063 && (!VECTOR_BOOLEAN_TYPE_P (vectype) 8064 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1)); 8065 } 8066 8067 /* Return true if we can emulate CODE on an integer mode representation 8068 of a vector. */ 8069 8070 bool 8071 vect_can_vectorize_without_simd_p (tree_code code) 8072 { 8073 switch (code) 8074 { 8075 case PLUS_EXPR: 8076 case MINUS_EXPR: 8077 case NEGATE_EXPR: 8078 case BIT_AND_EXPR: 8079 case BIT_IOR_EXPR: 8080 case BIT_XOR_EXPR: 8081 case BIT_NOT_EXPR: 8082 return true; 8083 8084 default: 8085 return false; 8086 } 8087 } 8088 8089 /* Likewise, but taking a code_helper. */ 8090 8091 bool 8092 vect_can_vectorize_without_simd_p (code_helper code) 8093 { 8094 return (code.is_tree_code () 8095 && vect_can_vectorize_without_simd_p (tree_code (code))); 8096 } 8097 8098 /* Function vectorizable_induction 8099 8100 Check if STMT_INFO performs an induction computation that can be vectorized. 8101 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized 8102 phi to replace it, put it in VEC_STMT, and add it to the same basic block. 8103 Return true if STMT_INFO is vectorizable in this way. */ 8104 8105 bool 8106 vectorizable_induction (loop_vec_info loop_vinfo, 8107 stmt_vec_info stmt_info, 8108 gimple **vec_stmt, slp_tree slp_node, 8109 stmt_vector_for_cost *cost_vec) 8110 { 8111 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8112 unsigned ncopies; 8113 bool nested_in_vect_loop = false; 8114 class loop *iv_loop; 8115 tree vec_def; 8116 edge pe = loop_preheader_edge (loop); 8117 basic_block new_bb; 8118 tree new_vec, vec_init, vec_step, t; 8119 tree new_name; 8120 gimple *new_stmt; 8121 gphi *induction_phi; 8122 tree induc_def, vec_dest; 8123 tree init_expr, step_expr; 8124 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8125 unsigned i; 8126 tree expr; 8127 gimple_stmt_iterator si; 8128 8129 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); 8130 if (!phi) 8131 return false; 8132 8133 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 8134 return false; 8135 8136 /* Make sure it was recognized as induction computation. */ 8137 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 8138 return false; 8139 8140 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 8141 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 8142 8143 if (slp_node) 8144 ncopies = 1; 8145 else 8146 ncopies = vect_get_num_copies (loop_vinfo, vectype); 8147 gcc_assert (ncopies >= 1); 8148 8149 /* FORNOW. These restrictions should be relaxed. 
*/ 8150 if (nested_in_vect_loop_p (loop, stmt_info)) 8151 { 8152 imm_use_iterator imm_iter; 8153 use_operand_p use_p; 8154 gimple *exit_phi; 8155 edge latch_e; 8156 tree loop_arg; 8157 8158 if (ncopies > 1) 8159 { 8160 if (dump_enabled_p ()) 8161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8162 "multiple types in nested loop.\n"); 8163 return false; 8164 } 8165 8166 exit_phi = NULL; 8167 latch_e = loop_latch_edge (loop->inner); 8168 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 8169 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 8170 { 8171 gimple *use_stmt = USE_STMT (use_p); 8172 if (is_gimple_debug (use_stmt)) 8173 continue; 8174 8175 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt))) 8176 { 8177 exit_phi = use_stmt; 8178 break; 8179 } 8180 } 8181 if (exit_phi) 8182 { 8183 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi); 8184 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 8185 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) 8186 { 8187 if (dump_enabled_p ()) 8188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8189 "inner-loop induction only used outside " 8190 "of the outer vectorized loop.\n"); 8191 return false; 8192 } 8193 } 8194 8195 nested_in_vect_loop = true; 8196 iv_loop = loop->inner; 8197 } 8198 else 8199 iv_loop = loop; 8200 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); 8201 8202 if (slp_node && !nunits.is_constant ()) 8203 { 8204 /* The current SLP code creates the step value element-by-element. */ 8205 if (dump_enabled_p ()) 8206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8207 "SLP induction not supported for variable-length" 8208 " vectors.\n"); 8209 return false; 8210 } 8211 8212 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float) 8213 { 8214 if (dump_enabled_p ()) 8215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8216 "floating point induction vectorization disabled\n"); 8217 return false; 8218 } 8219 8220 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); 8221 gcc_assert (step_expr != NULL_TREE); 8222 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype); 8223 8224 /* Check for backend support of PLUS/MINUS_EXPR. */ 8225 if (!directly_supported_p (PLUS_EXPR, step_vectype) 8226 || !directly_supported_p (MINUS_EXPR, step_vectype)) 8227 return false; 8228 8229 if (!vec_stmt) /* transformation not required. */ 8230 { 8231 unsigned inside_cost = 0, prologue_cost = 0; 8232 if (slp_node) 8233 { 8234 /* We eventually need to set a vector type on invariant 8235 arguments. */ 8236 unsigned j; 8237 slp_tree child; 8238 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) 8239 if (!vect_maybe_update_slp_op_vectype 8240 (child, SLP_TREE_VECTYPE (slp_node))) 8241 { 8242 if (dump_enabled_p ()) 8243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8244 "incompatible vector types for " 8245 "invariants\n"); 8246 return false; 8247 } 8248 /* loop cost for vec_loop. */ 8249 inside_cost 8250 = record_stmt_cost (cost_vec, 8251 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), 8252 vector_stmt, stmt_info, 0, vect_body); 8253 /* prologue cost for vec_init (if not nested) and step. */ 8254 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop, 8255 scalar_to_vec, 8256 stmt_info, 0, vect_prologue); 8257 } 8258 else /* if (!slp_node) */ 8259 { 8260 /* loop cost for vec_loop. */ 8261 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, 8262 stmt_info, 0, vect_body); 8263 /* prologue cost for vec_init and vec_step. 
*/ 8264 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, 8265 stmt_info, 0, vect_prologue); 8266 } 8267 if (dump_enabled_p ()) 8268 dump_printf_loc (MSG_NOTE, vect_location, 8269 "vect_model_induction_cost: inside_cost = %d, " 8270 "prologue_cost = %d .\n", inside_cost, 8271 prologue_cost); 8272 8273 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; 8274 DUMP_VECT_SCOPE ("vectorizable_induction"); 8275 return true; 8276 } 8277 8278 /* Transform. */ 8279 8280 /* Compute a vector variable, initialized with the first VF values of 8281 the induction variable. E.g., for an iv with IV_PHI='X' and 8282 evolution S, for a vector of 4 units, we want to compute: 8283 [X, X + S, X + 2*S, X + 3*S]. */ 8284 8285 if (dump_enabled_p ()) 8286 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); 8287 8288 pe = loop_preheader_edge (iv_loop); 8289 /* Find the first insertion point in the BB. */ 8290 basic_block bb = gimple_bb (phi); 8291 si = gsi_after_labels (bb); 8292 8293 /* For SLP induction we have to generate several IVs as for example 8294 with group size 3 we need 8295 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1] 8296 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */ 8297 if (slp_node) 8298 { 8299 /* Enforced above. */ 8300 unsigned int const_nunits = nunits.to_constant (); 8301 8302 /* The initial values are vectorized, but any lanes > group_size 8303 need adjustment. */ 8304 slp_tree init_node 8305 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx]; 8306 8307 /* Gather steps. Since we do not vectorize inductions as 8308 cycles we have to reconstruct the step from SCEV data. */ 8309 unsigned group_size = SLP_TREE_LANES (slp_node); 8310 tree *steps = XALLOCAVEC (tree, group_size); 8311 tree *inits = XALLOCAVEC (tree, group_size); 8312 stmt_vec_info phi_info; 8313 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info) 8314 { 8315 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); 8316 if (!init_node) 8317 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt), 8318 pe->dest_idx); 8319 } 8320 8321 /* Now generate the IVs. */ 8322 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 8323 gcc_assert ((const_nunits * nvects) % group_size == 0); 8324 unsigned nivs; 8325 if (nested_in_vect_loop) 8326 nivs = nvects; 8327 else 8328 { 8329 /* Compute the number of distinct IVs we need. First reduce 8330 group_size if it is a multiple of const_nunits so we get 8331 one IV for a group_size of 4 but const_nunits 2. */ 8332 unsigned group_sizep = group_size; 8333 if (group_sizep % const_nunits == 0) 8334 group_sizep = group_sizep / const_nunits; 8335 nivs = least_common_multiple (group_sizep, 8336 const_nunits) / const_nunits; 8337 } 8338 tree stept = TREE_TYPE (step_vectype); 8339 tree lupdate_mul = NULL_TREE; 8340 if (!nested_in_vect_loop) 8341 { 8342 /* The number of iterations covered in one vector iteration. */ 8343 unsigned lup_mul = (nvects * const_nunits) / group_size; 8344 lupdate_mul 8345 = build_vector_from_val (step_vectype, 8346 SCALAR_FLOAT_TYPE_P (stept) 8347 ? 
build_real_from_wide (stept, lup_mul, 8348 UNSIGNED) 8349 : build_int_cstu (stept, lup_mul)); 8350 } 8351 tree peel_mul = NULL_TREE; 8352 gimple_seq init_stmts = NULL; 8353 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) 8354 { 8355 if (SCALAR_FLOAT_TYPE_P (stept)) 8356 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, 8357 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); 8358 else 8359 peel_mul = gimple_convert (&init_stmts, stept, 8360 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); 8361 peel_mul = gimple_build_vector_from_val (&init_stmts, 8362 step_vectype, peel_mul); 8363 } 8364 unsigned ivn; 8365 auto_vec<tree> vec_steps; 8366 for (ivn = 0; ivn < nivs; ++ivn) 8367 { 8368 tree_vector_builder step_elts (step_vectype, const_nunits, 1); 8369 tree_vector_builder init_elts (vectype, const_nunits, 1); 8370 tree_vector_builder mul_elts (step_vectype, const_nunits, 1); 8371 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) 8372 { 8373 /* The scalar steps of the IVs. */ 8374 tree elt = steps[(ivn*const_nunits + eltn) % group_size]; 8375 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt); 8376 step_elts.quick_push (elt); 8377 if (!init_node) 8378 { 8379 /* The scalar inits of the IVs if not vectorized. */ 8380 elt = inits[(ivn*const_nunits + eltn) % group_size]; 8381 if (!useless_type_conversion_p (TREE_TYPE (vectype), 8382 TREE_TYPE (elt))) 8383 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, 8384 TREE_TYPE (vectype), elt); 8385 init_elts.quick_push (elt); 8386 } 8387 /* The number of steps to add to the initial values. */ 8388 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size; 8389 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept) 8390 ? build_real_from_wide (stept, 8391 mul_elt, UNSIGNED) 8392 : build_int_cstu (stept, mul_elt)); 8393 } 8394 vec_step = gimple_build_vector (&init_stmts, &step_elts); 8395 vec_steps.safe_push (vec_step); 8396 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts); 8397 if (peel_mul) 8398 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype, 8399 step_mul, peel_mul); 8400 if (!init_node) 8401 vec_init = gimple_build_vector (&init_stmts, &init_elts); 8402 8403 /* Create the induction-phi that defines the induction-operand. 
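Illustratively, each generated IV looks like
  loop:
    vec_iv_j = PHI <vec_init_j (preheader), vec_def_j (latch)>
    ...
    vec_def_j = vec_iv_j + vec_step_j * lup_mul
where the multiplication by LUP_MUL accounts for the number of scalar iterations covered by one vector iteration.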
*/ 8404 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, 8405 "vec_iv_"); 8406 induction_phi = create_phi_node (vec_dest, iv_loop->header); 8407 induc_def = PHI_RESULT (induction_phi); 8408 8409 /* Create the iv update inside the loop */ 8410 tree up = vec_step; 8411 if (lupdate_mul) 8412 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype, 8413 vec_step, lupdate_mul); 8414 gimple_seq stmts = NULL; 8415 vec_def = gimple_convert (&stmts, step_vectype, induc_def); 8416 vec_def = gimple_build (&stmts, 8417 PLUS_EXPR, step_vectype, vec_def, up); 8418 vec_def = gimple_convert (&stmts, vectype, vec_def); 8419 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 8420 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 8421 UNKNOWN_LOCATION); 8422 8423 if (init_node) 8424 vec_init = vect_get_slp_vect_def (init_node, ivn); 8425 if (!nested_in_vect_loop 8426 && !integer_zerop (step_mul)) 8427 { 8428 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init); 8429 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype, 8430 vec_step, step_mul); 8431 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype, 8432 vec_def, up); 8433 vec_init = gimple_convert (&init_stmts, vectype, vec_def); 8434 } 8435 8436 /* Set the arguments of the phi node: */ 8437 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 8438 8439 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi); 8440 } 8441 if (!nested_in_vect_loop) 8442 { 8443 /* Fill up to the number of vectors we need for the whole group. */ 8444 nivs = least_common_multiple (group_size, 8445 const_nunits) / const_nunits; 8446 vec_steps.reserve (nivs-ivn); 8447 for (; ivn < nivs; ++ivn) 8448 { 8449 SLP_TREE_VEC_STMTS (slp_node) 8450 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]); 8451 vec_steps.quick_push (vec_steps[0]); 8452 } 8453 } 8454 8455 /* Re-use IVs when we can. We are generating further vector 8456 stmts by adding VF' * stride to the IVs generated above. */ 8457 if (ivn < nvects) 8458 { 8459 unsigned vfp 8460 = least_common_multiple (group_size, const_nunits) / group_size; 8461 tree lupdate_mul 8462 = build_vector_from_val (step_vectype, 8463 SCALAR_FLOAT_TYPE_P (stept) 8464 ? build_real_from_wide (stept, 8465 vfp, UNSIGNED) 8466 : build_int_cstu (stept, vfp)); 8467 for (; ivn < nvects; ++ivn) 8468 { 8469 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]; 8470 tree def = gimple_get_lhs (iv); 8471 if (ivn < 2*nivs) 8472 vec_steps[ivn - nivs] 8473 = gimple_build (&init_stmts, MULT_EXPR, step_vectype, 8474 vec_steps[ivn - nivs], lupdate_mul); 8475 gimple_seq stmts = NULL; 8476 def = gimple_convert (&stmts, step_vectype, def); 8477 def = gimple_build (&stmts, PLUS_EXPR, step_vectype, 8478 def, vec_steps[ivn % nivs]); 8479 def = gimple_convert (&stmts, vectype, def); 8480 if (gimple_code (iv) == GIMPLE_PHI) 8481 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 8482 else 8483 { 8484 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); 8485 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING); 8486 } 8487 SLP_TREE_VEC_STMTS (slp_node) 8488 .quick_push (SSA_NAME_DEF_STMT (def)); 8489 } 8490 } 8491 8492 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts); 8493 gcc_assert (!new_bb); 8494 8495 return true; 8496 } 8497 8498 init_expr = vect_phi_initial_value (phi); 8499 8500 gimple_seq stmts = NULL; 8501 if (!nested_in_vect_loop) 8502 { 8503 /* Convert the initial value to the IV update type. 
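(The conversion ensures that the arithmetic below, including the optional adjustment init' = init - skip_niters * step when loop masks are used to peel for alignment, is carried out in the type of STEP_EXPR.)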
*/ 8504 tree new_type = TREE_TYPE (step_expr); 8505 init_expr = gimple_convert (&stmts, new_type, init_expr); 8506 8507 /* If we are using the loop mask to "peel" for alignment then we need 8508 to adjust the start value here. */ 8509 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); 8510 if (skip_niters != NULL_TREE) 8511 { 8512 if (FLOAT_TYPE_P (vectype)) 8513 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, 8514 skip_niters); 8515 else 8516 skip_niters = gimple_convert (&stmts, new_type, skip_niters); 8517 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, 8518 skip_niters, step_expr); 8519 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, 8520 init_expr, skip_step); 8521 } 8522 } 8523 8524 if (stmts) 8525 { 8526 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 8527 gcc_assert (!new_bb); 8528 } 8529 8530 /* Create the vector that holds the initial_value of the induction. */ 8531 if (nested_in_vect_loop) 8532 { 8533 /* iv_loop is nested in the loop to be vectorized. init_expr had already 8534 been created during vectorization of previous stmts. We obtain it 8535 from the STMT_VINFO_VEC_STMT of the defining stmt. */ 8536 auto_vec<tree> vec_inits; 8537 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1, 8538 init_expr, &vec_inits); 8539 vec_init = vec_inits[0]; 8540 /* If the initial value is not of proper type, convert it. */ 8541 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) 8542 { 8543 new_stmt 8544 = gimple_build_assign (vect_get_new_ssa_name (vectype, 8545 vect_simple_var, 8546 "vec_iv_"), 8547 VIEW_CONVERT_EXPR, 8548 build1 (VIEW_CONVERT_EXPR, vectype, 8549 vec_init)); 8550 vec_init = gimple_assign_lhs (new_stmt); 8551 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), 8552 new_stmt); 8553 gcc_assert (!new_bb); 8554 } 8555 } 8556 else 8557 { 8558 /* iv_loop is the loop to be vectorized. Create: 8559 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 8560 stmts = NULL; 8561 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr); 8562 8563 unsigned HOST_WIDE_INT const_nunits; 8564 if (nunits.is_constant (&const_nunits)) 8565 { 8566 tree_vector_builder elts (step_vectype, const_nunits, 1); 8567 elts.quick_push (new_name); 8568 for (i = 1; i < const_nunits; i++) 8569 { 8570 /* Create: new_name_i = new_name + step_expr */ 8571 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), 8572 new_name, step_expr); 8573 elts.quick_push (new_name); 8574 } 8575 /* Create a vector from [new_name_0, new_name_1, ..., 8576 new_name_nunits-1] */ 8577 vec_init = gimple_build_vector (&stmts, &elts); 8578 } 8579 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) 8580 /* Build the initial value directly from a VEC_SERIES_EXPR. */ 8581 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype, 8582 new_name, step_expr); 8583 else 8584 { 8585 /* Build: 8586 [base, base, base, ...] 8587 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. 
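This variable-length fallback is only used for floating-point steps and relies on -fassociative-math, as the asserts below check.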
*/ 8588 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); 8589 gcc_assert (flag_associative_math); 8590 tree index = build_index_vector (step_vectype, 0, 1); 8591 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype, 8592 new_name); 8593 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype, 8594 step_expr); 8595 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index); 8596 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype, 8597 vec_init, step_vec); 8598 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype, 8599 vec_init, base_vec); 8600 } 8601 vec_init = gimple_convert (&stmts, vectype, vec_init); 8602 8603 if (stmts) 8604 { 8605 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 8606 gcc_assert (!new_bb); 8607 } 8608 } 8609 8610 8611 /* Create the vector that holds the step of the induction. */ 8612 if (nested_in_vect_loop) 8613 /* iv_loop is nested in the loop to be vectorized. Generate: 8614 vec_step = [S, S, S, S] */ 8615 new_name = step_expr; 8616 else 8617 { 8618 /* iv_loop is the loop to be vectorized. Generate: 8619 vec_step = [VF*S, VF*S, VF*S, VF*S] */ 8620 gimple_seq seq = NULL; 8621 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 8622 { 8623 expr = build_int_cst (integer_type_node, vf); 8624 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 8625 } 8626 else 8627 expr = build_int_cst (TREE_TYPE (step_expr), vf); 8628 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 8629 expr, step_expr); 8630 if (seq) 8631 { 8632 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 8633 gcc_assert (!new_bb); 8634 } 8635 } 8636 8637 t = unshare_expr (new_name); 8638 gcc_assert (CONSTANT_CLASS_P (new_name) 8639 || TREE_CODE (new_name) == SSA_NAME); 8640 new_vec = build_vector_from_val (step_vectype, t); 8641 vec_step = vect_init_vector (loop_vinfo, stmt_info, 8642 new_vec, step_vectype, NULL); 8643 8644 8645 /* Create the following def-use cycle: 8646 loop prolog: 8647 vec_init = ... 8648 vec_step = ... 8649 loop: 8650 vec_iv = PHI <vec_init, vec_loop> 8651 ... 8652 STMT 8653 ... 8654 vec_loop = vec_iv + vec_step; */ 8655 8656 /* Create the induction-phi that defines the induction-operand. */ 8657 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 8658 induction_phi = create_phi_node (vec_dest, iv_loop->header); 8659 induc_def = PHI_RESULT (induction_phi); 8660 8661 /* Create the iv update inside the loop */ 8662 stmts = NULL; 8663 vec_def = gimple_convert (&stmts, step_vectype, induc_def); 8664 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step); 8665 vec_def = gimple_convert (&stmts, vectype, vec_def); 8666 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 8667 new_stmt = SSA_NAME_DEF_STMT (vec_def); 8668 8669 /* Set the arguments of the phi node: */ 8670 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 8671 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 8672 UNKNOWN_LOCATION); 8673 8674 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi); 8675 *vec_stmt = induction_phi; 8676 8677 /* In case that vectorization factor (VF) is bigger than the number 8678 of elements that we can fit in a vectype (nunits), we have to generate 8679 more than one vector stmt - i.e - we need to "unroll" the 8680 vector stmt by a factor VF/nunits. For more details see documentation 8681 in vectorizable_operation. */ 8682 8683 if (ncopies > 1) 8684 { 8685 gimple_seq seq = NULL; 8686 /* FORNOW. This restriction should be relaxed. 
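(Generating the extra copies this way is only supported for inductions that are not nested inside the vectorized loop.)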
*/ 8687 gcc_assert (!nested_in_vect_loop); 8688 8689 /* Create the vector that holds the step of the induction. */ 8690 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 8691 { 8692 expr = build_int_cst (integer_type_node, nunits); 8693 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 8694 } 8695 else 8696 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 8697 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 8698 expr, step_expr); 8699 if (seq) 8700 { 8701 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 8702 gcc_assert (!new_bb); 8703 } 8704 8705 t = unshare_expr (new_name); 8706 gcc_assert (CONSTANT_CLASS_P (new_name) 8707 || TREE_CODE (new_name) == SSA_NAME); 8708 new_vec = build_vector_from_val (step_vectype, t); 8709 vec_step = vect_init_vector (loop_vinfo, stmt_info, 8710 new_vec, step_vectype, NULL); 8711 8712 vec_def = induc_def; 8713 for (i = 1; i < ncopies; i++) 8714 { 8715 /* vec_i = vec_prev + vec_step */ 8716 gimple_seq stmts = NULL; 8717 vec_def = gimple_convert (&stmts, step_vectype, vec_def); 8718 vec_def = gimple_build (&stmts, 8719 PLUS_EXPR, step_vectype, vec_def, vec_step); 8720 vec_def = gimple_convert (&stmts, vectype, vec_def); 8721 8722 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 8723 new_stmt = SSA_NAME_DEF_STMT (vec_def); 8724 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); 8725 } 8726 } 8727 8728 if (dump_enabled_p ()) 8729 dump_printf_loc (MSG_NOTE, vect_location, 8730 "transform induction: created def-use cycle: %G%G", 8731 induction_phi, SSA_NAME_DEF_STMT (vec_def)); 8732 8733 return true; 8734 } 8735 8736 /* Function vectorizable_live_operation. 8737 8738 STMT_INFO computes a value that is used outside the loop. Check if 8739 it can be supported. */ 8740 8741 bool 8742 vectorizable_live_operation (vec_info *vinfo, 8743 stmt_vec_info stmt_info, 8744 gimple_stmt_iterator *gsi, 8745 slp_tree slp_node, slp_instance slp_node_instance, 8746 int slp_index, bool vec_stmt_p, 8747 stmt_vector_for_cost *cost_vec) 8748 { 8749 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); 8750 imm_use_iterator imm_iter; 8751 tree lhs, lhs_type, bitsize; 8752 tree vectype = (slp_node 8753 ? SLP_TREE_VECTYPE (slp_node) 8754 : STMT_VINFO_VECTYPE (stmt_info)); 8755 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 8756 int ncopies; 8757 gimple *use_stmt; 8758 auto_vec<tree> vec_oprnds; 8759 int vec_entry = 0; 8760 poly_uint64 vec_index = 0; 8761 8762 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); 8763 8764 /* If a stmt of a reduction is live, vectorize it via 8765 vect_create_epilog_for_reduction. vectorizable_reduction assessed 8766 validity so just trigger the transform here. */ 8767 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))) 8768 { 8769 if (!vec_stmt_p) 8770 return true; 8771 if (slp_node) 8772 { 8773 /* For reduction chains the meta-info is attached to 8774 the group leader. */ 8775 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 8776 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); 8777 /* For SLP reductions we vectorize the epilogue for 8778 all involved stmts together. 
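Only the first live statement (slp_index zero) triggers the epilogue creation below; the remaining statements of the SLP node simply return.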
*/ 8779 else if (slp_index != 0) 8780 return true; 8781 } 8782 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); 8783 gcc_assert (reduc_info->is_reduc_info); 8784 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION 8785 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION) 8786 return true; 8787 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node, 8788 slp_node_instance); 8789 return true; 8790 } 8791 8792 /* If STMT is not relevant and it is a simple assignment and its inputs are 8793 invariant then it can remain in place, unvectorized. The original last 8794 scalar value that it computes will be used. */ 8795 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 8796 { 8797 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo)); 8798 if (dump_enabled_p ()) 8799 dump_printf_loc (MSG_NOTE, vect_location, 8800 "statement is simple and uses invariant. Leaving in " 8801 "place.\n"); 8802 return true; 8803 } 8804 8805 if (slp_node) 8806 ncopies = 1; 8807 else 8808 ncopies = vect_get_num_copies (loop_vinfo, vectype); 8809 8810 if (slp_node) 8811 { 8812 gcc_assert (slp_index >= 0); 8813 8814 /* Get the last occurrence of the scalar index from the concatenation of 8815 all the slp vectors. Calculate which slp vector it is and the index 8816 within. */ 8817 int num_scalar = SLP_TREE_LANES (slp_node); 8818 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 8819 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; 8820 8821 /* Calculate which vector contains the result, and which lane of 8822 that vector we need. */ 8823 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index)) 8824 { 8825 if (dump_enabled_p ()) 8826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8827 "Cannot determine which vector holds the" 8828 " final result.\n"); 8829 return false; 8830 } 8831 } 8832 8833 if (!vec_stmt_p) 8834 { 8835 /* No transformation required. */ 8836 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) 8837 { 8838 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, 8839 OPTIMIZE_FOR_SPEED)) 8840 { 8841 if (dump_enabled_p ()) 8842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8843 "can't operate on partial vectors " 8844 "because the target doesn't support extract " 8845 "last reduction.\n"); 8846 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; 8847 } 8848 else if (slp_node) 8849 { 8850 if (dump_enabled_p ()) 8851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8852 "can't operate on partial vectors " 8853 "because an SLP statement is live after " 8854 "the loop.\n"); 8855 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; 8856 } 8857 else if (ncopies > 1) 8858 { 8859 if (dump_enabled_p ()) 8860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8861 "can't operate on partial vectors " 8862 "because ncopies is greater than 1.\n"); 8863 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; 8864 } 8865 else 8866 { 8867 gcc_assert (ncopies == 1 && !slp_node); 8868 vect_record_loop_mask (loop_vinfo, 8869 &LOOP_VINFO_MASKS (loop_vinfo), 8870 1, vectype, NULL); 8871 } 8872 } 8873 /* ??? Enable for loop costing as well. */ 8874 if (!loop_vinfo) 8875 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE, 8876 0, vect_epilogue); 8877 return true; 8878 } 8879 8880 /* Use the lhs of the original scalar statement. 
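If STMT_INFO is a pattern statement, vect_orig_stmt gives back the statement the user wrote; its lhs is the value that is live outside the loop.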
*/ 8881 gimple *stmt = vect_orig_stmt (stmt_info)->stmt; 8882 if (dump_enabled_p ()) 8883 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live " 8884 "stmt %G", stmt); 8885 8886 lhs = gimple_get_lhs (stmt); 8887 lhs_type = TREE_TYPE (lhs); 8888 8889 bitsize = vector_element_bits_tree (vectype); 8890 8891 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */ 8892 tree vec_lhs, bitstart; 8893 gimple *vec_stmt; 8894 if (slp_node) 8895 { 8896 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 8897 8898 /* Get the correct slp vectorized stmt. */ 8899 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]; 8900 vec_lhs = gimple_get_lhs (vec_stmt); 8901 8902 /* Get entry to use. */ 8903 bitstart = bitsize_int (vec_index); 8904 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart); 8905 } 8906 else 8907 { 8908 /* For multiple copies, get the last copy. */ 8909 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last (); 8910 vec_lhs = gimple_get_lhs (vec_stmt); 8911 8912 /* Get the last lane in the vector. */ 8913 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1)); 8914 } 8915 8916 if (loop_vinfo) 8917 { 8918 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI 8919 requirement, insert one phi node for it. It looks like: 8920 loop; 8921 BB: 8922 # lhs' = PHI <lhs> 8923 ==> 8924 loop; 8925 BB: 8926 # vec_lhs' = PHI <vec_lhs> 8927 new_tree = lane_extract <vec_lhs', ...>; 8928 lhs' = new_tree; */ 8929 8930 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8931 basic_block exit_bb = single_exit (loop)->dest; 8932 gcc_assert (single_pred_p (exit_bb)); 8933 8934 tree vec_lhs_phi = copy_ssa_name (vec_lhs); 8935 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb); 8936 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs); 8937 8938 gimple_seq stmts = NULL; 8939 tree new_tree; 8940 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 8941 { 8942 /* Emit: 8943 8944 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> 8945 8946 where VEC_LHS is the vectorized live-out result and MASK is 8947 the loop mask for the final iteration. */ 8948 gcc_assert (ncopies == 1 && !slp_node); 8949 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); 8950 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 8951 1, vectype, 0); 8952 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, 8953 mask, vec_lhs_phi); 8954 8955 /* Convert the extracted vector element to the scalar type. */ 8956 new_tree = gimple_convert (&stmts, lhs_type, scalar_res); 8957 } 8958 else 8959 { 8960 tree bftype = TREE_TYPE (vectype); 8961 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 8962 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); 8963 new_tree = build3 (BIT_FIELD_REF, bftype, 8964 vec_lhs_phi, bitsize, bitstart); 8965 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), 8966 &stmts, true, NULL_TREE); 8967 } 8968 8969 if (stmts) 8970 { 8971 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb); 8972 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 8973 8974 /* Remove existing phi from lhs and create one copy from new_tree. 
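I.e. the loop-closed  # lhs' = PHI <lhs>  in the exit block is replaced by a plain copy  lhs' = new_tree  fed by the lane extraction inserted above, completing the transform sketched earlier.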
*/ 8975 tree lhs_phi = NULL_TREE; 8976 gimple_stmt_iterator gsi; 8977 for (gsi = gsi_start_phis (exit_bb); 8978 !gsi_end_p (gsi); gsi_next (&gsi)) 8979 { 8980 gimple *phi = gsi_stmt (gsi); 8981 if ((gimple_phi_arg_def (phi, 0) == lhs)) 8982 { 8983 remove_phi_node (&gsi, false); 8984 lhs_phi = gimple_phi_result (phi); 8985 gimple *copy = gimple_build_assign (lhs_phi, new_tree); 8986 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT); 8987 break; 8988 } 8989 } 8990 } 8991 8992 /* Replace use of lhs with newly computed result. If the use stmt is a 8993 single arg PHI, just replace all uses of PHI result. It's necessary 8994 because lcssa PHI defining lhs may be before newly inserted stmt. */ 8995 use_operand_p use_p; 8996 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) 8997 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 8998 && !is_gimple_debug (use_stmt)) 8999 { 9000 if (gimple_code (use_stmt) == GIMPLE_PHI 9001 && gimple_phi_num_args (use_stmt) == 1) 9002 { 9003 replace_uses_by (gimple_phi_result (use_stmt), new_tree); 9004 } 9005 else 9006 { 9007 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 9008 SET_USE (use_p, new_tree); 9009 } 9010 update_stmt (use_stmt); 9011 } 9012 } 9013 else 9014 { 9015 /* For basic-block vectorization simply insert the lane-extraction. */ 9016 tree bftype = TREE_TYPE (vectype); 9017 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 9018 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); 9019 tree new_tree = build3 (BIT_FIELD_REF, bftype, 9020 vec_lhs, bitsize, bitstart); 9021 gimple_seq stmts = NULL; 9022 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), 9023 &stmts, true, NULL_TREE); 9024 if (TREE_CODE (new_tree) == SSA_NAME 9025 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)) 9026 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1; 9027 if (is_a <gphi *> (vec_stmt)) 9028 { 9029 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt)); 9030 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 9031 } 9032 else 9033 { 9034 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt); 9035 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT); 9036 } 9037 9038 /* Replace use of lhs with newly computed result. If the use stmt is a 9039 single arg PHI, just replace all uses of PHI result. It's necessary 9040 because lcssa PHI defining lhs may be before newly inserted stmt. */ 9041 use_operand_p use_p; 9042 stmt_vec_info use_stmt_info; 9043 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) 9044 if (!is_gimple_debug (use_stmt) 9045 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt)) 9046 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))) 9047 { 9048 /* ??? This can happen when the live lane ends up being 9049 used in a vector construction code-generated by an 9050 external SLP node (and code-generation for that already 9051 happened). See gcc.dg/vect/bb-slp-47.c. 9052 Doing this is what would happen if that vector CTOR 9053 were not code-generated yet so it is not too bad. 9054 ??? In fact we'd likely want to avoid this situation 9055 in the first place. 
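The dominance check below therefore keeps the original scalar computation for such uses instead of rewriting them.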
*/ 9056 if (TREE_CODE (new_tree) == SSA_NAME 9057 && !SSA_NAME_IS_DEFAULT_DEF (new_tree) 9058 && gimple_code (use_stmt) != GIMPLE_PHI 9059 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree), 9060 use_stmt)) 9061 { 9062 enum tree_code code = gimple_assign_rhs_code (use_stmt); 9063 gcc_checking_assert (code == SSA_NAME 9064 || code == CONSTRUCTOR 9065 || code == VIEW_CONVERT_EXPR 9066 || CONVERT_EXPR_CODE_P (code)); 9067 if (dump_enabled_p ()) 9068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 9069 "Using original scalar computation for " 9070 "live lane because use preceeds vector " 9071 "def\n"); 9072 continue; 9073 } 9074 /* ??? It can also happen that we end up pulling a def into 9075 a loop where replacing out-of-loop uses would require 9076 a new LC SSA PHI node. Retain the original scalar in 9077 those cases as well. PR98064. */ 9078 if (TREE_CODE (new_tree) == SSA_NAME 9079 && !SSA_NAME_IS_DEFAULT_DEF (new_tree) 9080 && (gimple_bb (use_stmt)->loop_father 9081 != gimple_bb (vec_stmt)->loop_father) 9082 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father, 9083 gimple_bb (use_stmt)->loop_father)) 9084 { 9085 if (dump_enabled_p ()) 9086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 9087 "Using original scalar computation for " 9088 "live lane because there is an out-of-loop " 9089 "definition for it\n"); 9090 continue; 9091 } 9092 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 9093 SET_USE (use_p, new_tree); 9094 update_stmt (use_stmt); 9095 } 9096 } 9097 9098 return true; 9099 } 9100 9101 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */ 9102 9103 static void 9104 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info) 9105 { 9106 ssa_op_iter op_iter; 9107 imm_use_iterator imm_iter; 9108 def_operand_p def_p; 9109 gimple *ustmt; 9110 9111 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF) 9112 { 9113 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) 9114 { 9115 basic_block bb; 9116 9117 if (!is_gimple_debug (ustmt)) 9118 continue; 9119 9120 bb = gimple_bb (ustmt); 9121 9122 if (!flow_bb_inside_loop_p (loop, bb)) 9123 { 9124 if (gimple_debug_bind_p (ustmt)) 9125 { 9126 if (dump_enabled_p ()) 9127 dump_printf_loc (MSG_NOTE, vect_location, 9128 "killing debug use\n"); 9129 9130 gimple_debug_bind_reset_value (ustmt); 9131 update_stmt (ustmt); 9132 } 9133 else 9134 gcc_unreachable (); 9135 } 9136 } 9137 } 9138 } 9139 9140 /* Given loop represented by LOOP_VINFO, return true if computation of 9141 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false 9142 otherwise. */ 9143 9144 static bool 9145 loop_niters_no_overflow (loop_vec_info loop_vinfo) 9146 { 9147 /* Constant case. */ 9148 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 9149 { 9150 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo); 9151 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); 9152 9153 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST); 9154 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST); 9155 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) 9156 return true; 9157 } 9158 9159 widest_int max; 9160 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 9161 /* Check the upper bound of loop niters. 
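If the maximum iteration count of LOOP is known and is strictly smaller than the maximum value of the type of LOOP_VINFO_NITERS, then NITERSM1 + 1 cannot wrap around.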
*/ 9162 if (get_max_loop_iterations (loop, &max)) 9163 { 9164 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 9165 signop sgn = TYPE_SIGN (type); 9166 widest_int type_max = widest_int::from (wi::max_value (type), sgn); 9167 if (max < type_max) 9168 return true; 9169 } 9170 return false; 9171 } 9172 9173 /* Return a mask type with half the number of elements as OLD_TYPE, 9174 given that it should have mode NEW_MODE. */ 9175 9176 tree 9177 vect_halve_mask_nunits (tree old_type, machine_mode new_mode) 9178 { 9179 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2); 9180 return build_truth_vector_type_for_mode (nunits, new_mode); 9181 } 9182 9183 /* Return a mask type with twice as many elements as OLD_TYPE, 9184 given that it should have mode NEW_MODE. */ 9185 9186 tree 9187 vect_double_mask_nunits (tree old_type, machine_mode new_mode) 9188 { 9189 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2; 9190 return build_truth_vector_type_for_mode (nunits, new_mode); 9191 } 9192 9193 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to 9194 contain a sequence of NVECTORS masks that each control a vector of type 9195 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND 9196 these vector masks with the vector version of SCALAR_MASK. */ 9197 9198 void 9199 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, 9200 unsigned int nvectors, tree vectype, tree scalar_mask) 9201 { 9202 gcc_assert (nvectors != 0); 9203 if (masks->length () < nvectors) 9204 masks->safe_grow_cleared (nvectors, true); 9205 rgroup_controls *rgm = &(*masks)[nvectors - 1]; 9206 /* The number of scalars per iteration and the number of vectors are 9207 both compile-time constants. */ 9208 unsigned int nscalars_per_iter 9209 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), 9210 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); 9211 9212 if (scalar_mask) 9213 { 9214 scalar_cond_masked_key cond (scalar_mask, nvectors); 9215 loop_vinfo->scalar_cond_masked_set.add (cond); 9216 } 9217 9218 if (rgm->max_nscalars_per_iter < nscalars_per_iter) 9219 { 9220 rgm->max_nscalars_per_iter = nscalars_per_iter; 9221 rgm->type = truth_type_for (vectype); 9222 rgm->factor = 1; 9223 } 9224 } 9225 9226 /* Given a complete set of masks MASKS, extract mask number INDEX 9227 for an rgroup that operates on NVECTORS vectors of type VECTYPE, 9228 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI. 9229 9230 See the comment above vec_loop_masks for more details about the mask 9231 arrangement. */ 9232 9233 tree 9234 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, 9235 unsigned int nvectors, tree vectype, unsigned int index) 9236 { 9237 rgroup_controls *rgm = &(*masks)[nvectors - 1]; 9238 tree mask_type = rgm->type; 9239 9240 /* Populate the rgroup's mask array, if this is the first time we've 9241 used it. */ 9242 if (rgm->controls.is_empty ()) 9243 { 9244 rgm->controls.safe_grow_cleared (nvectors, true); 9245 for (unsigned int i = 0; i < nvectors; ++i) 9246 { 9247 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); 9248 /* Provide a dummy definition until the real one is available. 
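(The real definitions are created later, when the control statements for the partially-vectorized loop are generated.)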
*/ 9249 SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); 9250 rgm->controls[i] = mask; 9251 } 9252 } 9253 9254 tree mask = rgm->controls[index]; 9255 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), 9256 TYPE_VECTOR_SUBPARTS (vectype))) 9257 { 9258 /* A loop mask for data type X can be reused for data type Y 9259 if X has N times more elements than Y and if Y's elements 9260 are N times bigger than X's. In this case each sequence 9261 of N elements in the loop mask will be all-zero or all-one. 9262 We can then view-convert the mask so that each sequence of 9263 N elements is replaced by a single element. */ 9264 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), 9265 TYPE_VECTOR_SUBPARTS (vectype))); 9266 gimple_seq seq = NULL; 9267 mask_type = truth_type_for (vectype); 9268 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); 9269 if (seq) 9270 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); 9271 } 9272 return mask; 9273 } 9274 9275 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS 9276 lengths for controlling an operation on VECTYPE. The operation splits 9277 each element of VECTYPE into FACTOR separate subelements, measuring the 9278 length as a number of these subelements. */ 9279 9280 void 9281 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens, 9282 unsigned int nvectors, tree vectype, unsigned int factor) 9283 { 9284 gcc_assert (nvectors != 0); 9285 if (lens->length () < nvectors) 9286 lens->safe_grow_cleared (nvectors, true); 9287 rgroup_controls *rgl = &(*lens)[nvectors - 1]; 9288 9289 /* The number of scalars per iteration, scalar occupied bytes and 9290 the number of vectors are both compile-time constants. */ 9291 unsigned int nscalars_per_iter 9292 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), 9293 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); 9294 9295 if (rgl->max_nscalars_per_iter < nscalars_per_iter) 9296 { 9297 /* For now, we only support cases in which all loads and stores fall back 9298 to VnQI or none do. */ 9299 gcc_assert (!rgl->max_nscalars_per_iter 9300 || (rgl->factor == 1 && factor == 1) 9301 || (rgl->max_nscalars_per_iter * rgl->factor 9302 == nscalars_per_iter * factor)); 9303 rgl->max_nscalars_per_iter = nscalars_per_iter; 9304 rgl->type = vectype; 9305 rgl->factor = factor; 9306 } 9307 } 9308 9309 /* Given a complete set of length LENS, extract length number INDEX for an 9310 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */ 9311 9312 tree 9313 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens, 9314 unsigned int nvectors, unsigned int index) 9315 { 9316 rgroup_controls *rgl = &(*lens)[nvectors - 1]; 9317 bool use_bias_adjusted_len = 9318 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0; 9319 9320 /* Populate the rgroup's len array, if this is the first time we've 9321 used it. */ 9322 if (rgl->controls.is_empty ()) 9323 { 9324 rgl->controls.safe_grow_cleared (nvectors, true); 9325 for (unsigned int i = 0; i < nvectors; ++i) 9326 { 9327 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); 9328 gcc_assert (len_type != NULL_TREE); 9329 9330 tree len = make_temp_ssa_name (len_type, NULL, "loop_len"); 9331 9332 /* Provide a dummy definition until the real one is available. 
*/ 9333 SSA_NAME_DEF_STMT (len) = gimple_build_nop (); 9334 rgl->controls[i] = len; 9335 9336 if (use_bias_adjusted_len) 9337 { 9338 gcc_assert (i == 0); 9339 tree adjusted_len = 9340 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len"); 9341 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop (); 9342 rgl->bias_adjusted_ctrl = adjusted_len; 9343 } 9344 } 9345 } 9346 9347 if (use_bias_adjusted_len) 9348 return rgl->bias_adjusted_ctrl; 9349 else 9350 return rgl->controls[index]; 9351 } 9352 9353 /* Scale profiling counters by estimation for LOOP which is vectorized 9354 by factor VF. */ 9355 9356 static void 9357 scale_profile_for_vect_loop (class loop *loop, unsigned vf) 9358 { 9359 edge preheader = loop_preheader_edge (loop); 9360 /* Reduce loop iterations by the vectorization factor. */ 9361 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); 9362 profile_count freq_h = loop->header->count, freq_e = preheader->count (); 9363 9364 if (freq_h.nonzero_p ()) 9365 { 9366 profile_probability p; 9367 9368 /* Avoid dropping loop body profile counter to 0 because of zero count 9369 in loop's preheader. */ 9370 if (!(freq_e == profile_count::zero ())) 9371 freq_e = freq_e.force_nonzero (); 9372 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h); 9373 scale_loop_frequencies (loop, p); 9374 } 9375 9376 edge exit_e = single_exit (loop); 9377 exit_e->probability = profile_probability::always () 9378 .apply_scale (1, new_est_niter + 1); 9379 9380 edge exit_l = single_pred_edge (loop->latch); 9381 profile_probability prob = exit_l->probability; 9382 exit_l->probability = exit_e->probability.invert (); 9383 if (prob.initialized_p () && exit_l->probability.initialized_p ()) 9384 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob); 9385 } 9386 9387 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI 9388 latch edge values originally defined by it. */ 9389 9390 static void 9391 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo, 9392 stmt_vec_info def_stmt_info) 9393 { 9394 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt); 9395 if (!def || TREE_CODE (def) != SSA_NAME) 9396 return; 9397 stmt_vec_info phi_info; 9398 imm_use_iterator iter; 9399 use_operand_p use_p; 9400 FOR_EACH_IMM_USE_FAST (use_p, iter, def) 9401 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p))) 9402 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi) 9403 && (phi_info = loop_vinfo->lookup_stmt (phi)) 9404 && STMT_VINFO_RELEVANT_P (phi_info) 9405 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info)) 9406 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION 9407 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION) 9408 { 9409 loop_p loop = gimple_bb (phi)->loop_father; 9410 edge e = loop_latch_edge (loop); 9411 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def) 9412 { 9413 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info); 9414 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info); 9415 gcc_assert (phi_defs.length () == latch_defs.length ()); 9416 for (unsigned i = 0; i < phi_defs.length (); ++i) 9417 add_phi_arg (as_a <gphi *> (phi_defs[i]), 9418 gimple_get_lhs (latch_defs[i]), e, 9419 gimple_phi_arg_location (phi, e->dest_idx)); 9420 } 9421 } 9422 } 9423 9424 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI. 9425 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its 9426 stmt_vec_info. 
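Return true if STMT_INFO was actually transformed here, false if it was skipped because it is neither relevant nor live or because it is handled entirely by SLP.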
*/ 9427 9428 static bool 9429 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, 9430 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) 9431 { 9432 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 9433 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 9434 9435 if (dump_enabled_p ()) 9436 dump_printf_loc (MSG_NOTE, vect_location, 9437 "------>vectorizing statement: %G", stmt_info->stmt); 9438 9439 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 9440 vect_loop_kill_debug_uses (loop, stmt_info); 9441 9442 if (!STMT_VINFO_RELEVANT_P (stmt_info) 9443 && !STMT_VINFO_LIVE_P (stmt_info)) 9444 return false; 9445 9446 if (STMT_VINFO_VECTYPE (stmt_info)) 9447 { 9448 poly_uint64 nunits 9449 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 9450 if (!STMT_SLP_TYPE (stmt_info) 9451 && maybe_ne (nunits, vf) 9452 && dump_enabled_p ()) 9453 /* For SLP VF is set according to unrolling factor, and not 9454 to vector size, hence for SLP this print is not valid. */ 9455 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 9456 } 9457 9458 /* Pure SLP statements have already been vectorized. We still need 9459 to apply loop vectorization to hybrid SLP statements. */ 9460 if (PURE_SLP_STMT (stmt_info)) 9461 return false; 9462 9463 if (dump_enabled_p ()) 9464 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n"); 9465 9466 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL)) 9467 *seen_store = stmt_info; 9468 9469 return true; 9470 } 9471 9472 /* Helper function to pass to simplify_replace_tree to enable replacing tree's 9473 in the hash_map with its corresponding values. */ 9474 9475 static tree 9476 find_in_mapping (tree t, void *context) 9477 { 9478 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context; 9479 9480 tree *value = mapping->get (t); 9481 return value ? *value : t; 9482 } 9483 9484 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the 9485 original loop that has now been vectorized. 9486 9487 The inits of the data_references need to be advanced with the number of 9488 iterations of the main loop. This has been computed in vect_do_peeling and 9489 is stored in parameter ADVANCE. We first restore the data_references 9490 initial offset with the values recored in ORIG_DRS_INIT. 9491 9492 Since the loop_vec_info of this EPILOGUE was constructed for the original 9493 loop, its stmt_vec_infos all point to the original statements. These need 9494 to be updated to point to their corresponding copies as well as the SSA_NAMES 9495 in their PATTERN_DEF_SEQs and RELATED_STMTs. 9496 9497 The data_reference's connections also need to be updated. Their 9498 corresponding dr_vec_info need to be reconnected to the EPILOGUE's 9499 stmt_vec_infos, their statements need to point to their corresponding copy, 9500 if they are gather loads or scatter stores then their reference needs to be 9501 updated to point to its corresponding copy and finally we set 9502 'base_misaligned' to false as we have already peeled for alignment in the 9503 prologue of the main loop. 
*/ 9504 9505 static void 9506 update_epilogue_loop_vinfo (class loop *epilogue, tree advance) 9507 { 9508 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue); 9509 auto_vec<gimple *> stmt_worklist; 9510 hash_map<tree,tree> mapping; 9511 gimple *orig_stmt, *new_stmt; 9512 gimple_stmt_iterator epilogue_gsi; 9513 gphi_iterator epilogue_phi_gsi; 9514 stmt_vec_info stmt_vinfo = NULL, related_vinfo; 9515 basic_block *epilogue_bbs = get_loop_body (epilogue); 9516 unsigned i; 9517 9518 free (LOOP_VINFO_BBS (epilogue_vinfo)); 9519 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs; 9520 9521 /* Advance data_reference's with the number of iterations of the previous 9522 loop and its prologue. */ 9523 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR); 9524 9525 9526 /* The EPILOGUE loop is a copy of the original loop so they share the same 9527 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to 9528 point to the copied statements. We also create a mapping of all LHS' in 9529 the original loop and all the LHS' in the EPILOGUE and create worklists to 9530 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */ 9531 for (unsigned i = 0; i < epilogue->num_nodes; ++i) 9532 { 9533 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]); 9534 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi)) 9535 { 9536 new_stmt = epilogue_phi_gsi.phi (); 9537 9538 gcc_assert (gimple_uid (new_stmt) > 0); 9539 stmt_vinfo 9540 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; 9541 9542 orig_stmt = STMT_VINFO_STMT (stmt_vinfo); 9543 STMT_VINFO_STMT (stmt_vinfo) = new_stmt; 9544 9545 mapping.put (gimple_phi_result (orig_stmt), 9546 gimple_phi_result (new_stmt)); 9547 /* PHI nodes can not have patterns or related statements. */ 9548 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL 9549 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL); 9550 } 9551 9552 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]); 9553 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi)) 9554 { 9555 new_stmt = gsi_stmt (epilogue_gsi); 9556 if (is_gimple_debug (new_stmt)) 9557 continue; 9558 9559 gcc_assert (gimple_uid (new_stmt) > 0); 9560 stmt_vinfo 9561 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; 9562 9563 orig_stmt = STMT_VINFO_STMT (stmt_vinfo); 9564 STMT_VINFO_STMT (stmt_vinfo) = new_stmt; 9565 9566 if (tree old_lhs = gimple_get_lhs (orig_stmt)) 9567 mapping.put (old_lhs, gimple_get_lhs (new_stmt)); 9568 9569 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo)) 9570 { 9571 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo); 9572 for (gimple_stmt_iterator gsi = gsi_start (seq); 9573 !gsi_end_p (gsi); gsi_next (&gsi)) 9574 stmt_worklist.safe_push (gsi_stmt (gsi)); 9575 } 9576 9577 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo); 9578 if (related_vinfo != NULL && related_vinfo != stmt_vinfo) 9579 { 9580 gimple *stmt = STMT_VINFO_STMT (related_vinfo); 9581 stmt_worklist.safe_push (stmt); 9582 /* Set BB such that the assert in 9583 'get_initial_def_for_reduction' is able to determine that 9584 the BB of the related stmt is inside this loop. 
*/ 9585 gimple_set_bb (stmt, 9586 gimple_bb (new_stmt)); 9587 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo); 9588 gcc_assert (related_vinfo == NULL 9589 || related_vinfo == stmt_vinfo); 9590 } 9591 } 9592 } 9593 9594 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed 9595 using the original main loop and thus need to be updated to refer to the 9596 cloned variables used in the epilogue. */ 9597 for (unsigned i = 0; i < stmt_worklist.length (); ++i) 9598 { 9599 gimple *stmt = stmt_worklist[i]; 9600 tree *new_op; 9601 9602 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j) 9603 { 9604 tree op = gimple_op (stmt, j); 9605 if ((new_op = mapping.get(op))) 9606 gimple_set_op (stmt, j, *new_op); 9607 else 9608 { 9609 /* PR92429: The last argument of simplify_replace_tree disables 9610 folding when replacing arguments. This is required as 9611 otherwise you might end up with different statements than the 9612 ones analyzed in vect_loop_analyze, leading to different 9613 vectorization. */ 9614 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE, 9615 &find_in_mapping, &mapping, false); 9616 gimple_set_op (stmt, j, op); 9617 } 9618 } 9619 } 9620 9621 struct data_reference *dr; 9622 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo); 9623 FOR_EACH_VEC_ELT (datarefs, i, dr) 9624 { 9625 orig_stmt = DR_STMT (dr); 9626 gcc_assert (gimple_uid (orig_stmt) > 0); 9627 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1]; 9628 /* Data references for gather loads and scatter stores do not use the 9629 updated offset we set using ADVANCE. Instead we have to make sure the 9630 reference in the data references points to the corresponding copy of 9631 the original in the epilogue. */ 9632 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo)) 9633 == VMAT_GATHER_SCATTER) 9634 { 9635 DR_REF (dr) 9636 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, 9637 &find_in_mapping, &mapping); 9638 DR_BASE_ADDRESS (dr) 9639 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, 9640 &find_in_mapping, &mapping); 9641 } 9642 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); 9643 stmt_vinfo->dr_aux.stmt = stmt_vinfo; 9644 /* The vector size of the epilogue is smaller than that of the main loop, 9645 so the alignment is either the same or lower. This means the dr will 9646 by definition be aligned. */ 9647 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false; 9648 } 9649 9650 epilogue_vinfo->shared->datarefs_copy.release (); 9651 epilogue_vinfo->shared->save_datarefs (); 9652 } 9653 9654 /* Function vect_transform_loop. 9655 9656 The analysis phase has determined that the loop is vectorizable. 9657 Vectorize the loop - create vectorized stmts to replace the scalar 9658 stmts in the loop, and update the loop exit condition. 9659 Returns the scalar epilogue loop if any.
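LOOP_VECTORIZED_CALL, if non-NULL, is the internal IFN_LOOP_VECTORIZED call guarding this loop; it is passed on to the loop versioning code.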
*/ 9660 9661 class loop * 9662 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) 9663 { 9664 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 9665 class loop *epilogue = NULL; 9666 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 9667 int nbbs = loop->num_nodes; 9668 int i; 9669 tree niters_vector = NULL_TREE; 9670 tree step_vector = NULL_TREE; 9671 tree niters_vector_mult_vf = NULL_TREE; 9672 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 9673 unsigned int lowest_vf = constant_lower_bound (vf); 9674 gimple *stmt; 9675 bool check_profitability = false; 9676 unsigned int th; 9677 9678 DUMP_VECT_SCOPE ("vec_transform_loop"); 9679 9680 loop_vinfo->shared->check_datarefs (); 9681 9682 /* Use the more conservative vectorization threshold. If the number 9683 of iterations is constant assume the cost check has been performed 9684 by our caller. If the threshold makes all loops profitable that 9685 run at least the (estimated) vectorization factor number of times 9686 checking is pointless, too. */ 9687 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 9688 if (vect_apply_runtime_profitability_check_p (loop_vinfo)) 9689 { 9690 if (dump_enabled_p ()) 9691 dump_printf_loc (MSG_NOTE, vect_location, 9692 "Profitability threshold is %d loop iterations.\n", 9693 th); 9694 check_profitability = true; 9695 } 9696 9697 /* Make sure there exists a single-predecessor exit bb. Do this before 9698 versioning. */ 9699 edge e = single_exit (loop); 9700 if (! single_pred_p (e->dest)) 9701 { 9702 split_loop_exit_edge (e, true); 9703 if (dump_enabled_p ()) 9704 dump_printf (MSG_NOTE, "split exit edge\n"); 9705 } 9706 9707 /* Version the loop first, if required, so the profitability check 9708 comes first. */ 9709 9710 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 9711 { 9712 class loop *sloop 9713 = vect_loop_versioning (loop_vinfo, loop_vectorized_call); 9714 sloop->force_vectorize = false; 9715 check_profitability = false; 9716 } 9717 9718 /* Make sure there exists a single-predecessor exit bb also on the 9719 scalar loop copy. Do this after versioning but before peeling 9720 so CFG structure is fine for both scalar and if-converted loop 9721 to make slpeel_duplicate_current_defs_from_edges face matched 9722 loop closed PHI nodes on the exit. */ 9723 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 9724 { 9725 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); 9726 if (! 
single_pred_p (e->dest)) 9727 { 9728 split_loop_exit_edge (e, true); 9729 if (dump_enabled_p ()) 9730 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); 9731 } 9732 } 9733 9734 tree niters = vect_build_loop_niters (loop_vinfo); 9735 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; 9736 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); 9737 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); 9738 tree advance; 9739 drs_init_vec orig_drs_init; 9740 9741 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, 9742 &step_vector, &niters_vector_mult_vf, th, 9743 check_profitability, niters_no_overflow, 9744 &advance); 9745 9746 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) 9747 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) 9748 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), 9749 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); 9750 9751 if (niters_vector == NULL_TREE) 9752 { 9753 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 9754 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) 9755 && known_eq (lowest_vf, vf)) 9756 { 9757 niters_vector 9758 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), 9759 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf); 9760 step_vector = build_one_cst (TREE_TYPE (niters)); 9761 } 9762 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) 9763 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, 9764 &step_vector, niters_no_overflow); 9765 else 9766 /* vect_do_peeling subtracted the number of peeled prologue 9767 iterations from LOOP_VINFO_NITERS. */ 9768 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo), 9769 &niters_vector, &step_vector, 9770 niters_no_overflow); 9771 } 9772 9773 /* 1) Make sure the loop header has exactly two entries; 9774 2) Make sure we have a preheader basic block. */ 9775 9776 gcc_assert (EDGE_COUNT (loop->header->preds) == 2); 9777 9778 split_edge (loop_preheader_edge (loop)); 9779 9780 if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) 9781 /* This will deal with any possible peeling. */ 9782 vect_prepare_for_masked_peels (loop_vinfo); 9783 9784 /* Schedule the SLP instances first, then handle loop vectorization 9785 below. */ 9786 if (!loop_vinfo->slp_instances.is_empty ()) 9787 { 9788 DUMP_VECT_SCOPE ("scheduling SLP instances"); 9789 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); 9790 } 9791 9792 /* FORNOW: the vectorizer supports only loops whose body consists 9793 of one basic block (header + empty latch). When the vectorizer 9794 supports more involved loop forms, the order in which the BBs are 9795 traversed will need to be reconsidered.

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* FORNOW: the vectorizer supports only loops whose body consists of
     one basic block (header + empty latch).  When the vectorizer supports
     more involved loop forms, the order in which the BBs are traversed
     will need to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "------>vectorizing phi: %G", phi);
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (!stmt_info)
            continue;

          if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, stmt_info);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if (STMT_VINFO_VECTYPE (stmt_info)
              && (maybe_ne
                  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
              && dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

          if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
              && ! PURE_SLP_STMT (stmt_info))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
              vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
            }
        }
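      /* Second walk over the PHIs: now that all PHI results in this block
         have been vectorized, try to fill in the latch (backedge)
         arguments of the vector PHIs created above.
         maybe_set_vectorized_backedge_value only does so when the latch
         value already has a vectorized definition.  */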
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (!stmt_info)
            continue;

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
              && ! PURE_SLP_STMT (stmt_info))
            maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si);)
        {
          stmt = gsi_stmt (si);
          /* During vectorization remove existing clobber stmts.  */
          if (gimple_clobber_p (stmt))
            {
              unlink_stmt_vdef (stmt);
              gsi_remove (&si, true);
              release_defs (stmt);
            }
          else
            {
              /* Ignore vector stmts created in the outer loop.  */
              stmt_info = loop_vinfo->lookup_stmt (stmt);

              /* Vector stmts created in the outer loop during vectorization
                 of stmts in an inner loop may not have a stmt_info, and do
                 not need to be vectorized.  */
              stmt_vec_info seen_store = NULL;
              if (stmt_info)
                {
                  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
                    {
                      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
                           !gsi_end_p (subsi); gsi_next (&subsi))
                        {
                          stmt_vec_info pat_stmt_info
                            = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
                          vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
                                                    &si, &seen_store);
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
                      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
                                                    &si, &seen_store))
                        maybe_set_vectorized_backedge_value (loop_vinfo,
                                                             pat_stmt_info);
                    }
                  else
                    {
                      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
                                                    &seen_store))
                        maybe_set_vectorized_backedge_value (loop_vinfo,
                                                             stmt_info);
                    }
                }
              gsi_next (&si);
              if (seen_store)
                {
                  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
                    /* Interleaving.  If IS_STORE is TRUE, the
                       vectorization of the interleaving chain was
                       completed - free all the stores in the chain.  */
                    vect_remove_stores (loop_vinfo,
                                        DR_GROUP_FIRST_ELEMENT (seen_store));
                  else
                    /* Free the attached stmt_vec_info and remove the
                       stmt.  */
                    loop_vinfo->remove_stmt (stmt_info);
                }
            }
        }

      /* Stub out scalar statements that must not survive vectorization.
         Doing this here helps with grouped statements, or statements that
         are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
           !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
          if (!call || !gimple_call_internal_p (call))
            continue;
          internal_fn ifn = gimple_call_internal_fn (call);
          if (ifn == IFN_MASK_LOAD)
            {
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree zero = build_zero_cst (TREE_TYPE (lhs));
                  gimple *new_stmt = gimple_build_assign (lhs, zero);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
          else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
            {
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree else_arg
                    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
                  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
        }
    } /* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment
     of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
                           niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
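  /* Worked example (illustration only) of the bound adjustments below:
     with no peeling for gaps and no partial vectors, BIAS_FOR_LOWEST is
     1 - 0 == 1.  If the scalar loop's latch count upper bound is 99
     (at most 100 iterations) and LOWEST_VF is 4, the vector loop's new
     upper bound becomes floor ((99 + 1) / 4) - 1 == 24 latch iterations,
     i.e. at most 25 vector iterations.  */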
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
     by the vector code.  */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
         In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
        = (final_iter_may_be_partial
           ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
                            lowest_vf) - 1
           : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
                             lowest_vf) - 1);
      if (main_vinfo
          /* Both peeling for alignment and peeling for gaps can end up
             with the scalar epilogue running for more than VF-1
             iterations.  */
          && !main_vinfo->peeling_for_alignment
          && !main_vinfo->peeling_for_gaps)
        {
          unsigned int bound;
          poly_uint64 main_iters
            = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
                           LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
          main_iters
            = upper_bound (main_iters,
                           LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
          if (can_div_away_from_zero_p (main_iters,
                                        LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                                        &bound))
            loop->nb_iterations_upper_bound
              = wi::umin ((widest_int) (bound - 1),
                          loop->nb_iterations_upper_bound);
        }
    }
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
                          + bias_for_lowest, lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
                           + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
                          assumed_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
                           assumed_vf) - 1);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP VECTORIZED\n");
          if (loop->inner)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "OUTER LOOP VECTORIZED\n");
          dump_printf (MSG_NOTE, "\n");
        }
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
                         GET_MODE_NAME (loop_vinfo->vector_mode));
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
                         " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is invalid after
     vectorization: the vectorized loop can have loop-carried
     dependences.  */
  loop->safelen = 0;

  if (epilogue)
    {
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}

/* The code below tries to perform a simple optimization: revert
   if-conversion for masked stores, i.e. if the mask of a store is zero,
   do not perform the store and, if possible, skip the statements that
   only compute the stored values as well.  For example,

     for (i=0; i<n; i++)
       if (c[i])
         {
           p1[i] += 1;
           p2[i] = p3[i] + 2;
         }

   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/

void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  free (bbs);
  if (worklist.is_empty ())
    return;
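  /* Note (illustration only): in the example above both MASK_STOREs use
     the same mask, so after the producers of the second stored value are
     moved as well, both stores end up in a single new block guarded by
     one mask == { 0, ... } test, giving exactly the semi-hammock shown
     in the comment before this function.  */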

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
         the same loop as if_bb.  It could be different from LOOP when a
         two-level loop-nest is vectorized and mask_store belongs to the
         inner one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM_3 = VDEF <.MEM_1>
         and new PHI node will be created in join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>  */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Setup GSI_TO to the non-empty block start.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Move stmt to created bb\n%G", last);
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* LHS of vectorized stmt must be SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;

              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                }

              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;

              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Move stmt to created bb\n%G", stmt1);
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
        {
          /* Add the maximum number of skipped iterations to the
             maximum iteration count.  */
          if (TREE_CODE (niters_skip) == INTEGER_CST)
            iv_limit += wi::to_widest (niters_skip);
          else
            iv_limit += max_vf - 1;
        }
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
        /* Make a conservatively-correct assumption.  */
        iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
         the maximum in-range IV value.  Round this value down to the
         previous vector alignment boundary and then add an extra full
         iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
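      /* For example (illustration only): with a maximum latch count of 10,
         no skipped or peeled iterations, and a constant VF of 4 (so
         MAX_VF == 4), this gives (10 & -4) + 4 == 12.  */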
    }
  return iv_limit;
}

/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */

bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
    return true;

  return false;
}