/* Loop Vectorization
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "params.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it had been manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.
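
   For instance, for the example loop above, scev would typically describe
   the induction variable as a chrec (chain of recurrences):

	i  -->  {0, +, 1}_1	(initial value 0, step 1, in loop number 1)

   and the array accesses a[i], b[i] and c[i] as linear functions of that
   chrec; the analyses below build on such descriptions.  (Illustrative
   sketch only; the exact dump syntax may differ.)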

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g. optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);

/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).
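
   Statements whose computed vector type is boolean_type_node are only
   queued in MASK_PRODUCERS here; their actual mask vector type is
   computed later (via vect_get_mask_type_for_stmt), once the vectorization
   factor has been determined in vect_determine_vectorization_factor.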
*/

static opt_result
vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf,
			      vec<stmt_vec_info > *mask_producers)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
						   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype has already been set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else if (stmt_vectype == boolean_type_node)
	mask_producers->safe_push (stmt_info);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}

/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  If some of the statements
   produce a mask result whose vector type can only be calculated later,
   add them to MASK_PRODUCERS.  Return true on success or false if
   something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
			    vec<stmt_vec_info > *mask_producers)
{
  vec_info *vinfo = stmt_info->vinfo;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  opt_result res
    = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple_seq pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
					      vf, mask_producers);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
      if (!res)
	return res;
    }

  return opt_result::success ();
}

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.
For example, when vectorizing a loop that operates on 4byte elements, 266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4 267 elements can fit in a single vector register. 268 269 We currently support vectorization of loops in which all types operated upon 270 are of the same size. Therefore this function currently sets VF according to 271 the size of the types operated upon, and fails if there are multiple sizes 272 in the loop. 273 274 VF is also the factor by which the loop iterations are strip-mined, e.g.: 275 original loop: 276 for (i=0; i<N; i++){ 277 a[i] = b[i] + c[i]; 278 } 279 280 vectorized loop: 281 for (i=0; i<N; i+=VF){ 282 a[i:VF] = b[i:VF] + c[i:VF]; 283 } 284 */ 285 286 static opt_result 287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo) 288 { 289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 291 unsigned nbbs = loop->num_nodes; 292 poly_uint64 vectorization_factor = 1; 293 tree scalar_type = NULL_TREE; 294 gphi *phi; 295 tree vectype; 296 stmt_vec_info stmt_info; 297 unsigned i; 298 auto_vec<stmt_vec_info> mask_producers; 299 300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor"); 301 302 for (i = 0; i < nbbs; i++) 303 { 304 basic_block bb = bbs[i]; 305 306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 307 gsi_next (&si)) 308 { 309 phi = si.phi (); 310 stmt_info = loop_vinfo->lookup_stmt (phi); 311 if (dump_enabled_p ()) 312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G", 313 phi); 314 315 gcc_assert (stmt_info); 316 317 if (STMT_VINFO_RELEVANT_P (stmt_info) 318 || STMT_VINFO_LIVE_P (stmt_info)) 319 { 320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info)); 321 scalar_type = TREE_TYPE (PHI_RESULT (phi)); 322 323 if (dump_enabled_p ()) 324 dump_printf_loc (MSG_NOTE, vect_location, 325 "get vectype for scalar type: %T\n", 326 scalar_type); 327 328 vectype = get_vectype_for_scalar_type (scalar_type); 329 if (!vectype) 330 return opt_result::failure_at (phi, 331 "not vectorized: unsupported " 332 "data-type %T\n", 333 scalar_type); 334 STMT_VINFO_VECTYPE (stmt_info) = vectype; 335 336 if (dump_enabled_p ()) 337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", 338 vectype); 339 340 if (dump_enabled_p ()) 341 { 342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); 343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype)); 344 dump_printf (MSG_NOTE, "\n"); 345 } 346 347 vect_update_max_nunits (&vectorization_factor, vectype); 348 } 349 } 350 351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 352 gsi_next (&si)) 353 { 354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 355 opt_result res 356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor, 357 &mask_producers); 358 if (!res) 359 return res; 360 } 361 } 362 363 /* TODO: Analyze cost. Decide if worth while to vectorize. 
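     (The cost-based decision itself is currently made later, in
     vect_analyze_loop_costing via vect_estimate_min_profitable_iters,
     once the vectorization factor and the per-statement costs are known.)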
*/ 364 if (dump_enabled_p ()) 365 { 366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = "); 367 dump_dec (MSG_NOTE, vectorization_factor); 368 dump_printf (MSG_NOTE, "\n"); 369 } 370 371 if (known_le (vectorization_factor, 1U)) 372 return opt_result::failure_at (vect_location, 373 "not vectorized: unsupported data-type\n"); 374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 375 376 for (i = 0; i < mask_producers.length (); i++) 377 { 378 stmt_info = mask_producers[i]; 379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info); 380 if (!mask_type) 381 return opt_result::propagate_failure (mask_type); 382 STMT_VINFO_VECTYPE (stmt_info) = mask_type; 383 } 384 385 return opt_result::success (); 386 } 387 388 389 /* Function vect_is_simple_iv_evolution. 390 391 FORNOW: A simple evolution of an induction variables in the loop is 392 considered a polynomial evolution. */ 393 394 static bool 395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, 396 tree * step) 397 { 398 tree init_expr; 399 tree step_expr; 400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb); 401 basic_block bb; 402 403 /* When there is no evolution in this loop, the evolution function 404 is not "simple". */ 405 if (evolution_part == NULL_TREE) 406 return false; 407 408 /* When the evolution is a polynomial of degree >= 2 409 the evolution function is not "simple". */ 410 if (tree_is_chrec (evolution_part)) 411 return false; 412 413 step_expr = evolution_part; 414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb)); 415 416 if (dump_enabled_p ()) 417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n", 418 step_expr, init_expr); 419 420 *init = init_expr; 421 *step = step_expr; 422 423 if (TREE_CODE (step_expr) != INTEGER_CST 424 && (TREE_CODE (step_expr) != SSA_NAME 425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr))) 426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb)) 427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) 428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)) 429 || !flag_associative_math))) 430 && (TREE_CODE (step_expr) != REAL_CST 431 || !flag_associative_math)) 432 { 433 if (dump_enabled_p ()) 434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 435 "step unknown.\n"); 436 return false; 437 } 438 439 return true; 440 } 441 442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in 443 what we are assuming is a double reduction. For example, given 444 a structure like this: 445 446 outer1: 447 x_1 = PHI <x_4(outer2), ...>; 448 ... 449 450 inner: 451 x_2 = PHI <x_1(outer1), ...>; 452 ... 453 x_3 = ...; 454 ... 455 456 outer2: 457 x_4 = PHI <x_3(inner)>; 458 ... 459 460 outer loop analysis would treat x_1 as a double reduction phi and 461 this function would then return true for x_2. */ 462 463 static bool 464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi) 465 { 466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 467 use_operand_p use_p; 468 ssa_op_iter op_iter; 469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE) 470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p))) 471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def) 472 return true; 473 return false; 474 } 475 476 /* Function vect_analyze_scalar_cycles_1. 477 478 Examine the cross iteration def-use cycles of scalar variables 479 in LOOP. 
LOOP_VINFO represents the loop that is now being 480 considered for vectorization (can be LOOP, or an outer-loop 481 enclosing LOOP). */ 482 483 static void 484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) 485 { 486 basic_block bb = loop->header; 487 tree init, step; 488 auto_vec<stmt_vec_info, 64> worklist; 489 gphi_iterator gsi; 490 bool double_reduc; 491 492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles"); 493 494 /* First - identify all inductions. Reduction detection assumes that all the 495 inductions have been identified, therefore, this order must not be 496 changed. */ 497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 498 { 499 gphi *phi = gsi.phi (); 500 tree access_fn = NULL; 501 tree def = PHI_RESULT (phi); 502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi); 503 504 if (dump_enabled_p ()) 505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi); 506 507 /* Skip virtual phi's. The data dependences that are associated with 508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ 509 if (virtual_operand_p (def)) 510 continue; 511 512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; 513 514 /* Analyze the evolution function. */ 515 access_fn = analyze_scalar_evolution (loop, def); 516 if (access_fn) 517 { 518 STRIP_NOPS (access_fn); 519 if (dump_enabled_p ()) 520 dump_printf_loc (MSG_NOTE, vect_location, 521 "Access function of PHI: %T\n", access_fn); 522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) 523 = initial_condition_in_loop_num (access_fn, loop->num); 524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) 525 = evolution_part_in_loop_num (access_fn, loop->num); 526 } 527 528 if (!access_fn 529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi) 530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step) 531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop 532 && TREE_CODE (step) != INTEGER_CST)) 533 { 534 worklist.safe_push (stmt_vinfo); 535 continue; 536 } 537 538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) 539 != NULL_TREE); 540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE); 541 542 if (dump_enabled_p ()) 543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n"); 544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def; 545 } 546 547 548 /* Second - identify all reductions and nested cycles. 
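     For example, a reduction such as "sum += a[i]" shows up as a cycle of
     the following shape in GIMPLE SSA form (illustrative sketch only):

       loop:
	 sum_1 = PHI <sum_0(preheader), sum_2(latch)>
	 ...
	 sum_2 = sum_1 + a[i];

     A nested cycle is the same kind of self-feeding loop-header PHI, but
     found in the inner loop while the enclosing outer loop is the one
     being vectorized.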
*/ 549 while (worklist.length () > 0) 550 { 551 stmt_vec_info stmt_vinfo = worklist.pop (); 552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt); 553 tree def = PHI_RESULT (phi); 554 555 if (dump_enabled_p ()) 556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi); 557 558 gcc_assert (!virtual_operand_p (def) 559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); 560 561 stmt_vec_info reduc_stmt_info 562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo, 563 &double_reduc, false); 564 if (reduc_stmt_info) 565 { 566 if (double_reduc) 567 { 568 if (dump_enabled_p ()) 569 dump_printf_loc (MSG_NOTE, vect_location, 570 "Detected double reduction.\n"); 571 572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; 573 STMT_VINFO_DEF_TYPE (reduc_stmt_info) 574 = vect_double_reduction_def; 575 } 576 else 577 { 578 if (loop != LOOP_VINFO_LOOP (loop_vinfo)) 579 { 580 if (dump_enabled_p ()) 581 dump_printf_loc (MSG_NOTE, vect_location, 582 "Detected vectorizable nested cycle.\n"); 583 584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; 585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle; 586 } 587 else 588 { 589 if (dump_enabled_p ()) 590 dump_printf_loc (MSG_NOTE, vect_location, 591 "Detected reduction.\n"); 592 593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; 594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def; 595 /* Store the reduction cycles for possible vectorization in 596 loop-aware SLP if it was not detected as reduction 597 chain. */ 598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) 599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push 600 (reduc_stmt_info); 601 } 602 } 603 } 604 else 605 if (dump_enabled_p ()) 606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 607 "Unknown def-use cycle pattern.\n"); 608 } 609 } 610 611 612 /* Function vect_analyze_scalar_cycles. 613 614 Examine the cross iteration def-use cycles of scalar variables, by 615 analyzing the loop-header PHIs of scalar variables. Classify each 616 cycle as one of the following: invariant, induction, reduction, unknown. 617 We do that for the loop represented by LOOP_VINFO, and also to its 618 inner-loop, if exists. 619 Examples for scalar cycles: 620 621 Example1: reduction: 622 623 loop1: 624 for (i=0; i<N; i++) 625 sum += a[i]; 626 627 Example2: induction: 628 629 loop2: 630 for (i=0; i<N; i++) 631 a[i] = i; */ 632 633 static void 634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) 635 { 636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 637 638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop); 639 640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially. 641 Reductions in such inner-loop therefore have different properties than 642 the reductions in the nest that gets vectorized: 643 1. When vectorized, they are executed in the same order as in the original 644 scalar loop, so we can't change the order of computation when 645 vectorizing them. 646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the 647 current checks are too strict. */ 648 649 if (loop->inner) 650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner); 651 } 652 653 /* Transfer group and reduction information from STMT_INFO to its 654 pattern stmt. 
*/ 655 656 static void 657 vect_fixup_reduc_chain (stmt_vec_info stmt_info) 658 { 659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info); 660 stmt_vec_info stmtp; 661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp) 662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)); 663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info); 664 do 665 { 666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info); 667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp; 668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info); 669 if (stmt_info) 670 REDUC_GROUP_NEXT_ELEMENT (stmtp) 671 = STMT_VINFO_RELATED_STMT (stmt_info); 672 } 673 while (stmt_info); 674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def; 675 } 676 677 /* Fixup scalar cycles that now have their stmts detected as patterns. */ 678 679 static void 680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) 681 { 682 stmt_vec_info first; 683 unsigned i; 684 685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first) 686 if (STMT_VINFO_IN_PATTERN_P (first)) 687 { 688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); 689 while (next) 690 { 691 if (! STMT_VINFO_IN_PATTERN_P (next)) 692 break; 693 next = REDUC_GROUP_NEXT_ELEMENT (next); 694 } 695 /* If not all stmt in the chain are patterns try to handle 696 the chain without patterns. */ 697 if (! next) 698 { 699 vect_fixup_reduc_chain (first); 700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] 701 = STMT_VINFO_RELATED_STMT (first); 702 } 703 } 704 } 705 706 /* Function vect_get_loop_niters. 707 708 Determine how many iterations the loop is executed and place it 709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations 710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the 711 niter information holds in ASSUMPTIONS. 712 713 Return the loop exit condition. */ 714 715 716 static gcond * 717 vect_get_loop_niters (struct loop *loop, tree *assumptions, 718 tree *number_of_iterations, tree *number_of_iterationsm1) 719 { 720 edge exit = single_exit (loop); 721 struct tree_niter_desc niter_desc; 722 tree niter_assumptions, niter, may_be_zero; 723 gcond *cond = get_loop_exit_condition (loop); 724 725 *assumptions = boolean_true_node; 726 *number_of_iterationsm1 = chrec_dont_know; 727 *number_of_iterations = chrec_dont_know; 728 DUMP_VECT_SCOPE ("get_loop_niters"); 729 730 if (!exit) 731 return cond; 732 733 niter = chrec_dont_know; 734 may_be_zero = NULL_TREE; 735 niter_assumptions = boolean_true_node; 736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) 737 || chrec_contains_undetermined (niter_desc.niter)) 738 return cond; 739 740 niter_assumptions = niter_desc.assumptions; 741 may_be_zero = niter_desc.may_be_zero; 742 niter = niter_desc.niter; 743 744 if (may_be_zero && integer_zerop (may_be_zero)) 745 may_be_zero = NULL_TREE; 746 747 if (may_be_zero) 748 { 749 if (COMPARISON_CLASS_P (may_be_zero)) 750 { 751 /* Try to combine may_be_zero with assumptions, this can simplify 752 computation of niter expression. 
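	     For instance (illustrative values only), if may_be_zero is
	     "n_5 <= 0" and niter is "(unsigned int) n_5 - 1", folding
	     "!(n_5 <= 0)" into the assumptions lets us use niter directly
	     instead of wrapping it in a COND_EXPR that yields 0 when the
	     loop body would not execute at all.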
*/ 753 if (niter_assumptions && !integer_nonzerop (niter_assumptions)) 754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, 755 niter_assumptions, 756 fold_build1 (TRUTH_NOT_EXPR, 757 boolean_type_node, 758 may_be_zero)); 759 else 760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero, 761 build_int_cst (TREE_TYPE (niter), 0), 762 rewrite_to_non_trapping_overflow (niter)); 763 764 may_be_zero = NULL_TREE; 765 } 766 else if (integer_nonzerop (may_be_zero)) 767 { 768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0); 769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1); 770 return cond; 771 } 772 else 773 return cond; 774 } 775 776 *assumptions = niter_assumptions; 777 *number_of_iterationsm1 = niter; 778 779 /* We want the number of loop header executions which is the number 780 of latch executions plus one. 781 ??? For UINT_MAX latch executions this number overflows to zero 782 for loops like do { n++; } while (n != 0); */ 783 if (niter && !chrec_contains_undetermined (niter)) 784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter), 785 build_int_cst (TREE_TYPE (niter), 1)); 786 *number_of_iterations = niter; 787 788 return cond; 789 } 790 791 /* Function bb_in_loop_p 792 793 Used as predicate for dfs order traversal of the loop bbs. */ 794 795 static bool 796 bb_in_loop_p (const_basic_block bb, const void *data) 797 { 798 const struct loop *const loop = (const struct loop *)data; 799 if (flow_bb_inside_loop_p (loop, bb)) 800 return true; 801 return false; 802 } 803 804 805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as 806 stmt_vec_info structs for all the stmts in LOOP_IN. */ 807 808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared) 809 : vec_info (vec_info::loop, init_cost (loop_in), shared), 810 loop (loop_in), 811 bbs (XCNEWVEC (basic_block, loop->num_nodes)), 812 num_itersm1 (NULL_TREE), 813 num_iters (NULL_TREE), 814 num_iters_unchanged (NULL_TREE), 815 num_iters_assumptions (NULL_TREE), 816 th (0), 817 versioning_threshold (0), 818 vectorization_factor (0), 819 max_vectorization_factor (0), 820 mask_skip_niters (NULL_TREE), 821 mask_compare_type (NULL_TREE), 822 simd_if_cond (NULL_TREE), 823 unaligned_dr (NULL), 824 peeling_for_alignment (0), 825 ptr_mask (0), 826 ivexpr_map (NULL), 827 slp_unrolling_factor (1), 828 single_scalar_iteration_cost (0), 829 vectorizable (false), 830 can_fully_mask_p (true), 831 fully_masked_p (false), 832 peeling_for_gaps (false), 833 peeling_for_niter (false), 834 operands_swapped (false), 835 no_data_dependencies (false), 836 has_mask_store (false), 837 scalar_loop (NULL), 838 orig_loop_info (NULL) 839 { 840 /* CHECKME: We want to visit all BBs before their successors (except for 841 latch blocks, for which this assertion wouldn't hold). In the simple 842 case of the loop forms we allow, a dfs order of the BBs would the same 843 as reversed postorder traversal, so we are safe. 
*/ 844 845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, 846 bbs, loop->num_nodes, loop); 847 gcc_assert (nbbs == loop->num_nodes); 848 849 for (unsigned int i = 0; i < nbbs; i++) 850 { 851 basic_block bb = bbs[i]; 852 gimple_stmt_iterator si; 853 854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) 855 { 856 gimple *phi = gsi_stmt (si); 857 gimple_set_uid (phi, 0); 858 add_stmt (phi); 859 } 860 861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 862 { 863 gimple *stmt = gsi_stmt (si); 864 gimple_set_uid (stmt, 0); 865 add_stmt (stmt); 866 /* If .GOMP_SIMD_LANE call for the current loop has 2 arguments, the 867 second argument is the #pragma omp simd if (x) condition, when 0, 868 loop shouldn't be vectorized, when non-zero constant, it should 869 be vectorized normally, otherwise versioned with vectorized loop 870 done if the condition is non-zero at runtime. */ 871 if (loop_in->simduid 872 && is_gimple_call (stmt) 873 && gimple_call_internal_p (stmt) 874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE 875 && gimple_call_num_args (stmt) >= 2 876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME 877 && (loop_in->simduid 878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))) 879 { 880 tree arg = gimple_call_arg (stmt, 1); 881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME) 882 simd_if_cond = arg; 883 else 884 gcc_assert (integer_nonzerop (arg)); 885 } 886 } 887 } 888 } 889 890 /* Free all levels of MASKS. */ 891 892 void 893 release_vec_loop_masks (vec_loop_masks *masks) 894 { 895 rgroup_masks *rgm; 896 unsigned int i; 897 FOR_EACH_VEC_ELT (*masks, i, rgm) 898 rgm->masks.release (); 899 masks->release (); 900 } 901 902 /* Free all memory used by the _loop_vec_info, as well as all the 903 stmt_vec_info structs of all the stmts in the loop. */ 904 905 _loop_vec_info::~_loop_vec_info () 906 { 907 int nbbs; 908 gimple_stmt_iterator si; 909 int j; 910 911 nbbs = loop->num_nodes; 912 for (j = 0; j < nbbs; j++) 913 { 914 basic_block bb = bbs[j]; 915 for (si = gsi_start_bb (bb); !gsi_end_p (si); ) 916 { 917 gimple *stmt = gsi_stmt (si); 918 919 /* We may have broken canonical form by moving a constant 920 into RHS1 of a commutative op. Fix such occurrences. */ 921 if (operands_swapped && is_gimple_assign (stmt)) 922 { 923 enum tree_code code = gimple_assign_rhs_code (stmt); 924 925 if ((code == PLUS_EXPR 926 || code == POINTER_PLUS_EXPR 927 || code == MULT_EXPR) 928 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt))) 929 swap_ssa_operands (stmt, 930 gimple_assign_rhs1_ptr (stmt), 931 gimple_assign_rhs2_ptr (stmt)); 932 else if (code == COND_EXPR 933 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt))) 934 { 935 tree cond_expr = gimple_assign_rhs1 (stmt); 936 enum tree_code cond_code = TREE_CODE (cond_expr); 937 938 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) 939 { 940 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 941 0)); 942 cond_code = invert_tree_comparison (cond_code, 943 honor_nans); 944 if (cond_code != ERROR_MARK) 945 { 946 TREE_SET_CODE (cond_expr, cond_code); 947 swap_ssa_operands (stmt, 948 gimple_assign_rhs2_ptr (stmt), 949 gimple_assign_rhs3_ptr (stmt)); 950 } 951 } 952 } 953 } 954 gsi_next (&si); 955 } 956 } 957 958 free (bbs); 959 960 release_vec_loop_masks (&masks); 961 delete ivexpr_map; 962 963 loop->aux = NULL; 964 } 965 966 /* Return an invariant or register for EXPR and emit necessary 967 computations in the LOOP_VINFO loop preheader. 
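   A caller might use it roughly as follows (hedged sketch, not taken from
   an actual call site; NITERS_EXPR stands for any loop-invariant expression):

     tree niters = cse_and_gimplify_to_preheader (loop_vinfo, niters_expr);
     ... use NITERS as an operand of statements emitted for the loop ...

   Because the cache is keyed by tree_operand_hash, operand_equal
   expressions share the same cached result (typically an SSA name), so the
   preheader computation for a given expression is emitted only once.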
*/ 968 969 tree 970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr) 971 { 972 if (is_gimple_reg (expr) 973 || is_gimple_min_invariant (expr)) 974 return expr; 975 976 if (! loop_vinfo->ivexpr_map) 977 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>; 978 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr); 979 if (! cached) 980 { 981 gimple_seq stmts = NULL; 982 cached = force_gimple_operand (unshare_expr (expr), 983 &stmts, true, NULL_TREE); 984 if (stmts) 985 { 986 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); 987 gsi_insert_seq_on_edge_immediate (e, stmts); 988 } 989 } 990 return cached; 991 } 992 993 /* Return true if we can use CMP_TYPE as the comparison type to produce 994 all masks required to mask LOOP_VINFO. */ 995 996 static bool 997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) 998 { 999 rgroup_masks *rgm; 1000 unsigned int i; 1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 1002 if (rgm->mask_type != NULL_TREE 1003 && !direct_internal_fn_supported_p (IFN_WHILE_ULT, 1004 cmp_type, rgm->mask_type, 1005 OPTIMIZE_FOR_SPEED)) 1006 return false; 1007 return true; 1008 } 1009 1010 /* Calculate the maximum number of scalars per iteration for every 1011 rgroup in LOOP_VINFO. */ 1012 1013 static unsigned int 1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) 1015 { 1016 unsigned int res = 1; 1017 unsigned int i; 1018 rgroup_masks *rgm; 1019 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 1020 res = MAX (res, rgm->max_nscalars_per_iter); 1021 return res; 1022 } 1023 1024 /* Each statement in LOOP_VINFO can be masked where necessary. Check 1025 whether we can actually generate the masks required. Return true if so, 1026 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */ 1027 1028 static bool 1029 vect_verify_full_masking (loop_vec_info loop_vinfo) 1030 { 1031 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1032 unsigned int min_ni_width; 1033 1034 /* Use a normal loop if there are no statements that need masking. 1035 This only happens in rare degenerate cases: it means that the loop 1036 has no loads, no stores, and no live-out values. */ 1037 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) 1038 return false; 1039 1040 /* Get the maximum number of iterations that is representable 1041 in the counter type. */ 1042 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); 1043 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; 1044 1045 /* Get a more refined estimate for the number of iterations. */ 1046 widest_int max_back_edges; 1047 if (max_loop_iterations (loop, &max_back_edges)) 1048 max_ni = wi::smin (max_ni, max_back_edges + 1); 1049 1050 /* Account for rgroup masks, in which each bit is replicated N times. */ 1051 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); 1052 1053 /* Work out how many bits we need to represent the limit. */ 1054 min_ni_width = wi::min_precision (max_ni, UNSIGNED); 1055 1056 /* Find a scalar mode for which WHILE_ULT is supported. 
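     That is, an integer mode whose type can act as the comparison type of
     IFN_WHILE_ULT for every rgroup mask type recorded in LOOP_VINFO_MASKS
     (see can_produce_all_loop_masks_p above).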
*/ 1057 opt_scalar_int_mode cmp_mode_iter; 1058 tree cmp_type = NULL_TREE; 1059 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) 1060 { 1061 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); 1062 if (cmp_bits >= min_ni_width 1063 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) 1064 { 1065 tree this_type = build_nonstandard_integer_type (cmp_bits, true); 1066 if (this_type 1067 && can_produce_all_loop_masks_p (loop_vinfo, this_type)) 1068 { 1069 /* Although we could stop as soon as we find a valid mode, 1070 it's often better to continue until we hit Pmode, since the 1071 operands to the WHILE are more likely to be reusable in 1072 address calculations. */ 1073 cmp_type = this_type; 1074 if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) 1075 break; 1076 } 1077 } 1078 } 1079 1080 if (!cmp_type) 1081 return false; 1082 1083 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; 1084 return true; 1085 } 1086 1087 /* Calculate the cost of one scalar iteration of the loop. */ 1088 static void 1089 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) 1090 { 1091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1092 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1093 int nbbs = loop->num_nodes, factor; 1094 int innerloop_iters, i; 1095 1096 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost"); 1097 1098 /* Gather costs for statements in the scalar loop. */ 1099 1100 /* FORNOW. */ 1101 innerloop_iters = 1; 1102 if (loop->inner) 1103 innerloop_iters = 50; /* FIXME */ 1104 1105 for (i = 0; i < nbbs; i++) 1106 { 1107 gimple_stmt_iterator si; 1108 basic_block bb = bbs[i]; 1109 1110 if (bb->loop_father == loop->inner) 1111 factor = innerloop_iters; 1112 else 1113 factor = 1; 1114 1115 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1116 { 1117 gimple *stmt = gsi_stmt (si); 1118 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt); 1119 1120 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) 1121 continue; 1122 1123 /* Skip stmts that are not vectorized inside the loop. */ 1124 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info); 1125 if (!STMT_VINFO_RELEVANT_P (vstmt_info) 1126 && (!STMT_VINFO_LIVE_P (vstmt_info) 1127 || !VECTORIZABLE_CYCLE_DEF 1128 (STMT_VINFO_DEF_TYPE (vstmt_info)))) 1129 continue; 1130 1131 vect_cost_for_stmt kind; 1132 if (STMT_VINFO_DATA_REF (stmt_info)) 1133 { 1134 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 1135 kind = scalar_load; 1136 else 1137 kind = scalar_store; 1138 } 1139 else 1140 kind = scalar_stmt; 1141 1142 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1143 factor, kind, stmt_info, 0, vect_prologue); 1144 } 1145 } 1146 1147 /* Now accumulate cost. */ 1148 void *target_cost_data = init_cost (loop); 1149 stmt_info_for_cost *si; 1150 int j; 1151 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1152 j, si) 1153 (void) add_stmt_cost (target_cost_data, si->count, 1154 si->kind, si->stmt_info, si->misalign, 1155 vect_body); 1156 unsigned dummy, body_cost = 0; 1157 finish_cost (target_cost_data, &dummy, &body_cost, &dummy); 1158 destroy_cost_data (target_cost_data); 1159 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost; 1160 } 1161 1162 1163 /* Function vect_analyze_loop_form_1. 1164 1165 Verify that certain CFG restrictions hold, including: 1166 - the loop has a pre-header 1167 - the loop has a single entry and exit 1168 - the loop exit condition is simple enough 1169 - the number of iterations can be analyzed, i.e, a countable loop. 
The 1170 niter could be analyzed under some assumptions. */ 1171 1172 opt_result 1173 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, 1174 tree *assumptions, tree *number_of_iterationsm1, 1175 tree *number_of_iterations, gcond **inner_loop_cond) 1176 { 1177 DUMP_VECT_SCOPE ("vect_analyze_loop_form"); 1178 1179 /* Different restrictions apply when we are considering an inner-most loop, 1180 vs. an outer (nested) loop. 1181 (FORNOW. May want to relax some of these restrictions in the future). */ 1182 1183 if (!loop->inner) 1184 { 1185 /* Inner-most loop. We currently require that the number of BBs is 1186 exactly 2 (the header and latch). Vectorizable inner-most loops 1187 look like this: 1188 1189 (pre-header) 1190 | 1191 header <--------+ 1192 | | | 1193 | +--> latch --+ 1194 | 1195 (exit-bb) */ 1196 1197 if (loop->num_nodes != 2) 1198 return opt_result::failure_at (vect_location, 1199 "not vectorized:" 1200 " control flow in loop.\n"); 1201 1202 if (empty_block_p (loop->header)) 1203 return opt_result::failure_at (vect_location, 1204 "not vectorized: empty loop.\n"); 1205 } 1206 else 1207 { 1208 struct loop *innerloop = loop->inner; 1209 edge entryedge; 1210 1211 /* Nested loop. We currently require that the loop is doubly-nested, 1212 contains a single inner loop, and the number of BBs is exactly 5. 1213 Vectorizable outer-loops look like this: 1214 1215 (pre-header) 1216 | 1217 header <---+ 1218 | | 1219 inner-loop | 1220 | | 1221 tail ------+ 1222 | 1223 (exit-bb) 1224 1225 The inner-loop has the properties expected of inner-most loops 1226 as described above. */ 1227 1228 if ((loop->inner)->inner || (loop->inner)->next) 1229 return opt_result::failure_at (vect_location, 1230 "not vectorized:" 1231 " multiple nested loops.\n"); 1232 1233 if (loop->num_nodes != 5) 1234 return opt_result::failure_at (vect_location, 1235 "not vectorized:" 1236 " control flow in loop.\n"); 1237 1238 entryedge = loop_preheader_edge (innerloop); 1239 if (entryedge->src != loop->header 1240 || !single_exit (innerloop) 1241 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src) 1242 return opt_result::failure_at (vect_location, 1243 "not vectorized:" 1244 " unsupported outerloop form.\n"); 1245 1246 /* Analyze the inner-loop. */ 1247 tree inner_niterm1, inner_niter, inner_assumptions; 1248 opt_result res 1249 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond, 1250 &inner_assumptions, &inner_niterm1, 1251 &inner_niter, NULL); 1252 if (!res) 1253 { 1254 if (dump_enabled_p ()) 1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1256 "not vectorized: Bad inner loop.\n"); 1257 return res; 1258 } 1259 1260 /* Don't support analyzing niter under assumptions for inner 1261 loop. 
*/ 1262 if (!integer_onep (inner_assumptions)) 1263 return opt_result::failure_at (vect_location, 1264 "not vectorized: Bad inner loop.\n"); 1265 1266 if (!expr_invariant_in_loop_p (loop, inner_niter)) 1267 return opt_result::failure_at (vect_location, 1268 "not vectorized: inner-loop count not" 1269 " invariant.\n"); 1270 1271 if (dump_enabled_p ()) 1272 dump_printf_loc (MSG_NOTE, vect_location, 1273 "Considering outer-loop vectorization.\n"); 1274 } 1275 1276 if (!single_exit (loop)) 1277 return opt_result::failure_at (vect_location, 1278 "not vectorized: multiple exits.\n"); 1279 if (EDGE_COUNT (loop->header->preds) != 2) 1280 return opt_result::failure_at (vect_location, 1281 "not vectorized:" 1282 " too many incoming edges.\n"); 1283 1284 /* We assume that the loop exit condition is at the end of the loop. i.e, 1285 that the loop is represented as a do-while (with a proper if-guard 1286 before the loop if needed), where the loop header contains all the 1287 executable statements, and the latch is empty. */ 1288 if (!empty_block_p (loop->latch) 1289 || !gimple_seq_empty_p (phi_nodes (loop->latch))) 1290 return opt_result::failure_at (vect_location, 1291 "not vectorized: latch block not empty.\n"); 1292 1293 /* Make sure the exit is not abnormal. */ 1294 edge e = single_exit (loop); 1295 if (e->flags & EDGE_ABNORMAL) 1296 return opt_result::failure_at (vect_location, 1297 "not vectorized:" 1298 " abnormal loop exit edge.\n"); 1299 1300 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations, 1301 number_of_iterationsm1); 1302 if (!*loop_cond) 1303 return opt_result::failure_at 1304 (vect_location, 1305 "not vectorized: complicated exit condition.\n"); 1306 1307 if (integer_zerop (*assumptions) 1308 || !*number_of_iterations 1309 || chrec_contains_undetermined (*number_of_iterations)) 1310 return opt_result::failure_at 1311 (*loop_cond, 1312 "not vectorized: number of iterations cannot be computed.\n"); 1313 1314 if (integer_zerop (*number_of_iterations)) 1315 return opt_result::failure_at 1316 (*loop_cond, 1317 "not vectorized: number of iterations = 0.\n"); 1318 1319 return opt_result::success (); 1320 } 1321 1322 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */ 1323 1324 opt_loop_vec_info 1325 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared) 1326 { 1327 tree assumptions, number_of_iterations, number_of_iterationsm1; 1328 gcond *loop_cond, *inner_loop_cond = NULL; 1329 1330 opt_result res 1331 = vect_analyze_loop_form_1 (loop, &loop_cond, 1332 &assumptions, &number_of_iterationsm1, 1333 &number_of_iterations, &inner_loop_cond); 1334 if (!res) 1335 return opt_loop_vec_info::propagate_failure (res); 1336 1337 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared); 1338 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1; 1339 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; 1340 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations; 1341 if (!integer_onep (assumptions)) 1342 { 1343 /* We consider to vectorize this loop by versioning it under 1344 some assumptions. In order to do this, we need to clear 1345 existing information computed by scev and niter analyzer. */ 1346 scev_reset_htab (); 1347 free_numbers_of_iterations_estimates (loop); 1348 /* Also set flag for this loop so that following scev and niter 1349 analysis are done under the assumptions. */ 1350 loop_constraint_set (loop, LOOP_C_FINITE); 1351 /* Also record the assumptions for versioning. 
*/ 1352 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions; 1353 } 1354 1355 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1356 { 1357 if (dump_enabled_p ()) 1358 { 1359 dump_printf_loc (MSG_NOTE, vect_location, 1360 "Symbolic number of iterations is "); 1361 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations); 1362 dump_printf (MSG_NOTE, "\n"); 1363 } 1364 } 1365 1366 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond); 1367 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type; 1368 if (inner_loop_cond) 1369 { 1370 stmt_vec_info inner_loop_cond_info 1371 = loop_vinfo->lookup_stmt (inner_loop_cond); 1372 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type; 1373 } 1374 1375 gcc_assert (!loop->aux); 1376 loop->aux = loop_vinfo; 1377 return opt_loop_vec_info::success (loop_vinfo); 1378 } 1379 1380 1381 1382 /* Scan the loop stmts and dependent on whether there are any (non-)SLP 1383 statements update the vectorization factor. */ 1384 1385 static void 1386 vect_update_vf_for_slp (loop_vec_info loop_vinfo) 1387 { 1388 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1389 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1390 int nbbs = loop->num_nodes; 1391 poly_uint64 vectorization_factor; 1392 int i; 1393 1394 DUMP_VECT_SCOPE ("vect_update_vf_for_slp"); 1395 1396 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1397 gcc_assert (known_ne (vectorization_factor, 0U)); 1398 1399 /* If all the stmts in the loop can be SLPed, we perform only SLP, and 1400 vectorization factor of the loop is the unrolling factor required by 1401 the SLP instances. If that unrolling factor is 1, we say, that we 1402 perform pure SLP on loop - cross iteration parallelism is not 1403 exploited. */ 1404 bool only_slp_in_loop = true; 1405 for (i = 0; i < nbbs; i++) 1406 { 1407 basic_block bb = bbs[i]; 1408 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1409 gsi_next (&si)) 1410 { 1411 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 1412 stmt_info = vect_stmt_to_vectorize (stmt_info); 1413 if ((STMT_VINFO_RELEVANT_P (stmt_info) 1414 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) 1415 && !PURE_SLP_STMT (stmt_info)) 1416 /* STMT needs both SLP and loop-based vectorization. */ 1417 only_slp_in_loop = false; 1418 } 1419 } 1420 1421 if (only_slp_in_loop) 1422 { 1423 if (dump_enabled_p ()) 1424 dump_printf_loc (MSG_NOTE, vect_location, 1425 "Loop contains only SLP stmts\n"); 1426 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); 1427 } 1428 else 1429 { 1430 if (dump_enabled_p ()) 1431 dump_printf_loc (MSG_NOTE, vect_location, 1432 "Loop contains SLP and non-SLP stmts\n"); 1433 /* Both the vectorization factor and unroll factor have the form 1434 current_vector_size * X for some rational X, so they must have 1435 a common multiple. */ 1436 vectorization_factor 1437 = force_common_multiple (vectorization_factor, 1438 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); 1439 } 1440 1441 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; 1442 if (dump_enabled_p ()) 1443 { 1444 dump_printf_loc (MSG_NOTE, vect_location, 1445 "Updating vectorization factor to "); 1446 dump_dec (MSG_NOTE, vectorization_factor); 1447 dump_printf (MSG_NOTE, ".\n"); 1448 } 1449 } 1450 1451 /* Return true if STMT_INFO describes a double reduction phi and if 1452 the other phi in the reduction is also relevant for vectorization. 
1453 This rejects cases such as: 1454 1455 outer1: 1456 x_1 = PHI <x_3(outer2), ...>; 1457 ... 1458 1459 inner: 1460 x_2 = ...; 1461 ... 1462 1463 outer2: 1464 x_3 = PHI <x_2(inner)>; 1465 1466 if nothing in x_2 or elsewhere makes x_1 relevant. */ 1467 1468 static bool 1469 vect_active_double_reduction_p (stmt_vec_info stmt_info) 1470 { 1471 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) 1472 return false; 1473 1474 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info)); 1475 } 1476 1477 /* Function vect_analyze_loop_operations. 1478 1479 Scan the loop stmts and make sure they are all vectorizable. */ 1480 1481 static opt_result 1482 vect_analyze_loop_operations (loop_vec_info loop_vinfo) 1483 { 1484 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1485 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1486 int nbbs = loop->num_nodes; 1487 int i; 1488 stmt_vec_info stmt_info; 1489 bool need_to_vectorize = false; 1490 bool ok; 1491 1492 DUMP_VECT_SCOPE ("vect_analyze_loop_operations"); 1493 1494 auto_vec<stmt_info_for_cost> cost_vec; 1495 1496 for (i = 0; i < nbbs; i++) 1497 { 1498 basic_block bb = bbs[i]; 1499 1500 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 1501 gsi_next (&si)) 1502 { 1503 gphi *phi = si.phi (); 1504 ok = true; 1505 1506 stmt_info = loop_vinfo->lookup_stmt (phi); 1507 if (dump_enabled_p ()) 1508 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi); 1509 if (virtual_operand_p (gimple_phi_result (phi))) 1510 continue; 1511 1512 /* Inner-loop loop-closed exit phi in outer-loop vectorization 1513 (i.e., a phi in the tail of the outer-loop). */ 1514 if (! is_loop_header_bb_p (bb)) 1515 { 1516 /* FORNOW: we currently don't support the case that these phis 1517 are not used in the outerloop (unless it is double reduction, 1518 i.e., this phi is vect_reduction_def), cause this case 1519 requires to actually do something here. */ 1520 if (STMT_VINFO_LIVE_P (stmt_info) 1521 && !vect_active_double_reduction_p (stmt_info)) 1522 return opt_result::failure_at (phi, 1523 "Unsupported loop-closed phi" 1524 " in outer-loop.\n"); 1525 1526 /* If PHI is used in the outer loop, we check that its operand 1527 is defined in the inner loop. */ 1528 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1529 { 1530 tree phi_op; 1531 1532 if (gimple_phi_num_args (phi) != 1) 1533 return opt_result::failure_at (phi, "unsupported phi"); 1534 1535 phi_op = PHI_ARG_DEF (phi, 0); 1536 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); 1537 if (!op_def_info) 1538 return opt_result::failure_at (phi, "unsupported phi"); 1539 1540 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer 1541 && (STMT_VINFO_RELEVANT (op_def_info) 1542 != vect_used_in_outer_by_reduction)) 1543 return opt_result::failure_at (phi, "unsupported phi"); 1544 } 1545 1546 continue; 1547 } 1548 1549 gcc_assert (stmt_info); 1550 1551 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope 1552 || STMT_VINFO_LIVE_P (stmt_info)) 1553 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 1554 /* A scalar-dependence cycle that we don't support. */ 1555 return opt_result::failure_at (phi, 1556 "not vectorized:" 1557 " scalar dependence cycle.\n"); 1558 1559 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1560 { 1561 need_to_vectorize = true; 1562 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 1563 && ! 
PURE_SLP_STMT (stmt_info)) 1564 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL, 1565 &cost_vec); 1566 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1567 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 1568 && ! PURE_SLP_STMT (stmt_info)) 1569 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL, 1570 &cost_vec); 1571 } 1572 1573 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ 1574 if (ok 1575 && STMT_VINFO_LIVE_P (stmt_info) 1576 && !PURE_SLP_STMT (stmt_info)) 1577 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL, 1578 &cost_vec); 1579 1580 if (!ok) 1581 return opt_result::failure_at (phi, 1582 "not vectorized: relevant phi not " 1583 "supported: %G", 1584 static_cast <gimple *> (phi)); 1585 } 1586 1587 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1588 gsi_next (&si)) 1589 { 1590 gimple *stmt = gsi_stmt (si); 1591 if (!gimple_clobber_p (stmt)) 1592 { 1593 opt_result res 1594 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt), 1595 &need_to_vectorize, 1596 NULL, NULL, &cost_vec); 1597 if (!res) 1598 return res; 1599 } 1600 } 1601 } /* bbs */ 1602 1603 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec); 1604 1605 /* All operations in the loop are either irrelevant (deal with loop 1606 control, or dead), or only used outside the loop and can be moved 1607 out of the loop (e.g. invariants, inductions). The loop can be 1608 optimized away by scalar optimizations. We're better off not 1609 touching this loop. */ 1610 if (!need_to_vectorize) 1611 { 1612 if (dump_enabled_p ()) 1613 dump_printf_loc (MSG_NOTE, vect_location, 1614 "All the computation can be taken out of the loop.\n"); 1615 return opt_result::failure_at 1616 (vect_location, 1617 "not vectorized: redundant loop. no profit to vectorize.\n"); 1618 } 1619 1620 return opt_result::success (); 1621 } 1622 1623 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it 1624 is worthwhile to vectorize. Return 1 if definitely yes, 0 if 1625 definitely no, or -1 if it's worth retrying. */ 1626 1627 static int 1628 vect_analyze_loop_costing (loop_vec_info loop_vinfo) 1629 { 1630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1631 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 1632 1633 /* Only fully-masked loops can have iteration counts less than the 1634 vectorization factor. 
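     For example, with an assumed VF of 4, a loop known to execute at most
     3 iterations is only worth vectorizing if each vector statement can be
     masked down to the live lanes; otherwise we give up below.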
*/ 1635 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 1636 { 1637 HOST_WIDE_INT max_niter; 1638 1639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1640 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo); 1641 else 1642 max_niter = max_stmt_executions_int (loop); 1643 1644 if (max_niter != -1 1645 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf) 1646 { 1647 if (dump_enabled_p ()) 1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1649 "not vectorized: iteration count smaller than " 1650 "vectorization factor.\n"); 1651 return 0; 1652 } 1653 } 1654 1655 int min_profitable_iters, min_profitable_estimate; 1656 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, 1657 &min_profitable_estimate); 1658 1659 if (min_profitable_iters < 0) 1660 { 1661 if (dump_enabled_p ()) 1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1663 "not vectorized: vectorization not profitable.\n"); 1664 if (dump_enabled_p ()) 1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1666 "not vectorized: vector version will never be " 1667 "profitable.\n"); 1668 return -1; 1669 } 1670 1671 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) 1672 * assumed_vf); 1673 1674 /* Use the cost model only if it is more conservative than user specified 1675 threshold. */ 1676 unsigned int th = (unsigned) MAX (min_scalar_loop_bound, 1677 min_profitable_iters); 1678 1679 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; 1680 1681 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 1682 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) 1683 { 1684 if (dump_enabled_p ()) 1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1686 "not vectorized: vectorization not profitable.\n"); 1687 if (dump_enabled_p ()) 1688 dump_printf_loc (MSG_NOTE, vect_location, 1689 "not vectorized: iteration count smaller than user " 1690 "specified loop bound parameter or minimum profitable " 1691 "iterations (whichever is more conservative).\n"); 1692 return 0; 1693 } 1694 1695 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); 1696 if (estimated_niter == -1) 1697 estimated_niter = likely_max_stmt_executions_int (loop); 1698 if (estimated_niter != -1 1699 && ((unsigned HOST_WIDE_INT) estimated_niter 1700 < MAX (th, (unsigned) min_profitable_estimate))) 1701 { 1702 if (dump_enabled_p ()) 1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1704 "not vectorized: estimated iteration count too " 1705 "small.\n"); 1706 if (dump_enabled_p ()) 1707 dump_printf_loc (MSG_NOTE, vect_location, 1708 "not vectorized: estimated iteration count smaller " 1709 "than specified loop bound parameter or minimum " 1710 "profitable iterations (whichever is more " 1711 "conservative).\n"); 1712 return -1; 1713 } 1714 1715 return 1; 1716 } 1717 1718 static opt_result 1719 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, 1720 vec<data_reference_p> *datarefs, 1721 unsigned int *n_stmts) 1722 { 1723 *n_stmts = 0; 1724 for (unsigned i = 0; i < loop->num_nodes; i++) 1725 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); 1726 !gsi_end_p (gsi); gsi_next (&gsi)) 1727 { 1728 gimple *stmt = gsi_stmt (gsi); 1729 if (is_gimple_debug (stmt)) 1730 continue; 1731 ++(*n_stmts); 1732 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs); 1733 if (!res) 1734 { 1735 if (is_gimple_call (stmt) && loop->safelen) 1736 { 1737 tree fndecl = gimple_call_fndecl (stmt), op; 1738 if (fndecl != NULL_TREE) 1739 { 1740 cgraph_node *node = cgraph_node::get (fndecl); 1741 if (node != NULL && 
node->simd_clones != NULL) 1742 { 1743 unsigned int j, n = gimple_call_num_args (stmt); 1744 for (j = 0; j < n; j++) 1745 { 1746 op = gimple_call_arg (stmt, j); 1747 if (DECL_P (op) 1748 || (REFERENCE_CLASS_P (op) 1749 && get_base_address (op))) 1750 break; 1751 } 1752 op = gimple_call_lhs (stmt); 1753 /* Ignore #pragma omp declare simd functions 1754 if they don't have data references in the 1755 call stmt itself. */ 1756 if (j == n 1757 && !(op 1758 && (DECL_P (op) 1759 || (REFERENCE_CLASS_P (op) 1760 && get_base_address (op))))) 1761 continue; 1762 } 1763 } 1764 } 1765 return res; 1766 } 1767 /* If dependence analysis will give up due to the limit on the 1768 number of datarefs stop here and fail fatally. */ 1769 if (datarefs->length () 1770 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)) 1771 return opt_result::failure_at (stmt, "exceeded param " 1772 "loop-max-datarefs-for-datadeps\n"); 1773 } 1774 return opt_result::success (); 1775 } 1776 1777 /* Function vect_analyze_loop_2. 1778 1779 Apply a set of analyses on LOOP, and create a loop_vec_info struct 1780 for it. The different analyses will record information in the 1781 loop_vec_info struct. */ 1782 static opt_result 1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) 1784 { 1785 opt_result ok = opt_result::success (); 1786 int res; 1787 unsigned int max_vf = MAX_VECTORIZATION_FACTOR; 1788 poly_uint64 min_vf = 2; 1789 1790 /* The first group of checks is independent of the vector size. */ 1791 fatal = true; 1792 1793 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo) 1794 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo))) 1795 return opt_result::failure_at (vect_location, 1796 "not vectorized: simd if(0)\n"); 1797 1798 /* Find all data references in the loop (which correspond to vdefs/vuses) 1799 and analyze their evolution in the loop. */ 1800 1801 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); 1802 1803 /* Gather the data references and count stmts in the loop. */ 1804 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) 1805 { 1806 opt_result res 1807 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), 1808 &LOOP_VINFO_DATAREFS (loop_vinfo), 1809 n_stmts); 1810 if (!res) 1811 { 1812 if (dump_enabled_p ()) 1813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1814 "not vectorized: loop contains function " 1815 "calls or data references that cannot " 1816 "be analyzed\n"); 1817 return res; 1818 } 1819 loop_vinfo->shared->save_datarefs (); 1820 } 1821 else 1822 loop_vinfo->shared->check_datarefs (); 1823 1824 /* Analyze the data references and also adjust the minimal 1825 vectorization factor according to the loads and stores. */ 1826 1827 ok = vect_analyze_data_refs (loop_vinfo, &min_vf); 1828 if (!ok) 1829 { 1830 if (dump_enabled_p ()) 1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1832 "bad data references.\n"); 1833 return ok; 1834 } 1835 1836 /* Classify all cross-iteration scalar data-flow cycles. 1837 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 1838 vect_analyze_scalar_cycles (loop_vinfo); 1839 1840 vect_pattern_recog (loop_vinfo); 1841 1842 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); 1843 1844 /* Analyze the access patterns of the data-refs in the loop (consecutive, 1845 complex, etc.). FORNOW: Only handle consecutive access pattern. 
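     To illustrate the distinction with a sketch (invented for this comment, not code from this pass):

       for (i = 0; i < N; i++)
         x[i] = y[i] * 2;          <-- consecutive (unit-stride) accesses

       for (i = 0; i < N; i++)
         x[i] = y[2*i];            <-- non-unit stride, one of the more
                                       complex patterns mentioned above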
*/ 1846 1847 ok = vect_analyze_data_ref_accesses (loop_vinfo); 1848 if (!ok) 1849 { 1850 if (dump_enabled_p ()) 1851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1852 "bad data access.\n"); 1853 return ok; 1854 } 1855 1856 /* Data-flow analysis to detect stmts that do not need to be vectorized. */ 1857 1858 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); 1859 if (!ok) 1860 { 1861 if (dump_enabled_p ()) 1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1863 "unexpected pattern.\n"); 1864 return ok; 1865 } 1866 1867 /* While the rest of the analysis below depends on it in some way. */ 1868 fatal = false; 1869 1870 /* Analyze data dependences between the data-refs in the loop 1871 and adjust the maximum vectorization factor according to 1872 the dependences. 1873 FORNOW: fail at the first data dependence that we encounter. */ 1874 1875 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); 1876 if (!ok) 1877 { 1878 if (dump_enabled_p ()) 1879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1880 "bad data dependence.\n"); 1881 return ok; 1882 } 1883 if (max_vf != MAX_VECTORIZATION_FACTOR 1884 && maybe_lt (max_vf, min_vf)) 1885 return opt_result::failure_at (vect_location, "bad data dependence.\n"); 1886 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; 1887 1888 ok = vect_determine_vectorization_factor (loop_vinfo); 1889 if (!ok) 1890 { 1891 if (dump_enabled_p ()) 1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1893 "can't determine vectorization factor.\n"); 1894 return ok; 1895 } 1896 if (max_vf != MAX_VECTORIZATION_FACTOR 1897 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 1898 return opt_result::failure_at (vect_location, "bad data dependence.\n"); 1899 1900 /* Compute the scalar iteration cost. */ 1901 vect_compute_single_scalar_iteration_cost (loop_vinfo); 1902 1903 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1904 unsigned th; 1905 1906 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ 1907 ok = vect_analyze_slp (loop_vinfo, *n_stmts); 1908 if (!ok) 1909 return ok; 1910 1911 /* If there are any SLP instances mark them as pure_slp. */ 1912 bool slp = vect_make_slp_decision (loop_vinfo); 1913 if (slp) 1914 { 1915 /* Find stmts that need to be both vectorized and SLPed. */ 1916 vect_detect_hybrid_slp (loop_vinfo); 1917 1918 /* Update the vectorization factor based on the SLP decision. */ 1919 vect_update_vf_for_slp (loop_vinfo); 1920 } 1921 1922 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo); 1923 1924 /* We don't expect to have to roll back to anything other than an empty 1925 set of rgroups. */ 1926 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); 1927 1928 /* This is the point where we can re-start analysis with SLP forced off. */ 1929 start_over: 1930 1931 /* Now the vectorization factor is final. */ 1932 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1933 gcc_assert (known_ne (vectorization_factor, 0U)); 1934 1935 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) 1936 { 1937 dump_printf_loc (MSG_NOTE, vect_location, 1938 "vectorization_factor = "); 1939 dump_dec (MSG_NOTE, vectorization_factor); 1940 dump_printf (MSG_NOTE, ", niters = %wd\n", 1941 LOOP_VINFO_INT_NITERS (loop_vinfo)); 1942 } 1943 1944 HOST_WIDE_INT max_niter 1945 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); 1946 1947 /* Analyze the alignment of the data-refs in the loop. 
1948 Fail if a data reference is found that cannot be vectorized. */ 1949 1950 ok = vect_analyze_data_refs_alignment (loop_vinfo); 1951 if (!ok) 1952 { 1953 if (dump_enabled_p ()) 1954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1955 "bad data alignment.\n"); 1956 return ok; 1957 } 1958 1959 /* Prune the list of ddrs to be tested at run-time by versioning for alias. 1960 It is important to call pruning after vect_analyze_data_ref_accesses, 1961 since we use grouping information gathered by interleaving analysis. */ 1962 ok = vect_prune_runtime_alias_test_list (loop_vinfo); 1963 if (!ok) 1964 return ok; 1965 1966 /* Do not invoke vect_enhance_data_refs_alignment for epilogue 1967 vectorization, since we do not want to add extra peeling or 1968 add versioning for alignment. */ 1969 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 1970 /* This pass will decide on using loop versioning and/or loop peeling in 1971 order to enhance the alignment of data references in the loop. */ 1972 ok = vect_enhance_data_refs_alignment (loop_vinfo); 1973 else 1974 ok = vect_verify_datarefs_alignment (loop_vinfo); 1975 if (!ok) 1976 return ok; 1977 1978 if (slp) 1979 { 1980 /* Analyze operations in the SLP instances. Note this may 1981 remove unsupported SLP instances which makes the above 1982 SLP kind detection invalid. */ 1983 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); 1984 vect_slp_analyze_operations (loop_vinfo); 1985 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) 1986 { 1987 ok = opt_result::failure_at (vect_location, 1988 "unsupported SLP instances\n"); 1989 goto again; 1990 } 1991 } 1992 1993 /* Scan all the remaining operations in the loop that are not subject 1994 to SLP and make sure they are vectorizable. */ 1995 ok = vect_analyze_loop_operations (loop_vinfo); 1996 if (!ok) 1997 { 1998 if (dump_enabled_p ()) 1999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2000 "bad operation or unsupported loop bound.\n"); 2001 return ok; 2002 } 2003 2004 /* Decide whether to use a fully-masked loop for this vectorization 2005 factor. */ 2006 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 2007 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) 2008 && vect_verify_full_masking (loop_vinfo)); 2009 if (dump_enabled_p ()) 2010 { 2011 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2012 dump_printf_loc (MSG_NOTE, vect_location, 2013 "using a fully-masked loop.\n"); 2014 else 2015 dump_printf_loc (MSG_NOTE, vect_location, 2016 "not using a fully-masked loop.\n"); 2017 } 2018 2019 /* If an epilog loop is required because of data accesses with gaps, 2020 one additional iteration needs to be peeled. Check if there are 2021 enough iterations for vectorization. */ 2022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2023 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2024 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2025 { 2026 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2027 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo); 2028 2029 if (known_lt (wi::to_widest (scalar_niters), vf)) 2030 return opt_result::failure_at (vect_location, 2031 "loop does not have enough iterations to" 2032 " support peeling for gaps.\n"); 2033 } 2034 2035 /* Check that the costings of the loop make vectorizing worthwhile.
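     For a concrete but purely illustrative reading of the result: with --param min-vect-loop-bound=0 and a computed min_profitable_iters of 7, the threshold recorded by vect_analyze_loop_costing is 7, so a loop known to execute only 4 iterations is rejected outright (return value 0), while a loop whose *estimated* trip count is too small merely triggers the retry path handled just below (return value -1).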
*/ 2036 res = vect_analyze_loop_costing (loop_vinfo); 2037 if (res < 0) 2038 { 2039 ok = opt_result::failure_at (vect_location, 2040 "Loop costings may not be worthwhile.\n"); 2041 goto again; 2042 } 2043 if (!res) 2044 return opt_result::failure_at (vect_location, 2045 "Loop costings not worthwhile.\n"); 2046 2047 /* Decide whether we need to create an epilogue loop to handle 2048 remaining scalar iterations. */ 2049 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 2050 2051 unsigned HOST_WIDE_INT const_vf; 2052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2053 /* The main loop handles all iterations. */ 2054 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2055 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2056 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) 2057 { 2058 /* Work out the (constant) number of iterations that need to be 2059 peeled for reasons other than niters. */ 2060 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2061 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2062 peel_niter += 1; 2063 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, 2064 LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 2065 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2066 } 2067 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2068 /* ??? When peeling for gaps but not alignment, we could 2069 try to check whether the (variable) niters is known to be 2070 VF * N + 1. That's something of a niche case though. */ 2071 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2072 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) 2073 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) 2074 < (unsigned) exact_log2 (const_vf)) 2075 /* In case of versioning, check if the maximum number of 2076 iterations is greater than th. If they are identical, 2077 the epilogue is unnecessary. */ 2078 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 2079 || ((unsigned HOST_WIDE_INT) max_niter 2080 > (th / const_vf) * const_vf)))) 2081 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2082 2083 /* If an epilogue loop is required make sure we can create one. */ 2084 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2085 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2086 { 2087 if (dump_enabled_p ()) 2088 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); 2089 if (!vect_can_advance_ivs_p (loop_vinfo) 2090 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo), 2091 single_exit (LOOP_VINFO_LOOP 2092 (loop_vinfo)))) 2093 { 2094 ok = opt_result::failure_at (vect_location, 2095 "not vectorized: can't create required " 2096 "epilog loop\n"); 2097 goto again; 2098 } 2099 } 2100 2101 /* During peeling, we need to check if number of loop iterations is 2102 enough for both peeled prolog loop and vector loop. This check 2103 can be merged along with threshold check of loop versioning, so 2104 increase threshold for this case if necessary. */ 2105 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 2106 { 2107 poly_uint64 niters_th = 0; 2108 2109 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) 2110 { 2111 /* Niters for peeled prolog loop. */ 2112 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2113 { 2114 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2115 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); 2116 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; 2117 } 2118 else 2119 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2120 } 2121 2122 /* Niters for at least one iteration of vectorized loop. 
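     Putting the threshold together with illustrative numbers: a prologue peel of 3 iterations for alignment, a vectorization factor of 8 and peeling for gaps would give a versioning threshold of 3 + 8 + 1 = 12 below, i.e. the versioned vector path is only worth entering when at least roughly a dozen scalar iterations are available.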
*/ 2123 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2124 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2125 /* One additional iteration because of peeling for gap. */ 2126 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2127 niters_th += 1; 2128 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; 2129 } 2130 2131 gcc_assert (known_eq (vectorization_factor, 2132 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); 2133 2134 /* Ok to vectorize! */ 2135 return opt_result::success (); 2136 2137 again: 2138 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ 2139 gcc_assert (!ok); 2140 2141 /* Try again with SLP forced off but if we didn't do any SLP there is 2142 no point in re-trying. */ 2143 if (!slp) 2144 return ok; 2145 2146 /* If there are reduction chains re-trying will fail anyway. */ 2147 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) 2148 return ok; 2149 2150 /* Likewise if the grouped loads or stores in the SLP cannot be handled 2151 via interleaving or lane instructions. */ 2152 slp_instance instance; 2153 slp_tree node; 2154 unsigned i, j; 2155 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 2156 { 2157 stmt_vec_info vinfo; 2158 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]; 2159 if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) 2160 continue; 2161 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); 2162 unsigned int size = DR_GROUP_SIZE (vinfo); 2163 tree vectype = STMT_VINFO_VECTYPE (vinfo); 2164 if (! vect_store_lanes_supported (vectype, size, false) 2165 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) 2166 && ! vect_grouped_store_supported (vectype, size)) 2167 return opt_result::failure_at (vinfo->stmt, 2168 "unsupported grouped store\n"); 2169 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) 2170 { 2171 vinfo = SLP_TREE_SCALAR_STMTS (node)[0]; 2172 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); 2173 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo); 2174 size = DR_GROUP_SIZE (vinfo); 2175 vectype = STMT_VINFO_VECTYPE (vinfo); 2176 if (! vect_load_lanes_supported (vectype, size, false) 2177 && ! vect_grouped_load_supported (vectype, single_element_p, 2178 size)) 2179 return opt_result::failure_at (vinfo->stmt, 2180 "unsupported grouped load\n"); 2181 } 2182 } 2183 2184 if (dump_enabled_p ()) 2185 dump_printf_loc (MSG_NOTE, vect_location, 2186 "re-trying with SLP disabled\n"); 2187 2188 /* Roll back state appropriately. No SLP this time. */ 2189 slp = false; 2190 /* Restore vectorization factor as it were without SLP. */ 2191 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; 2192 /* Free the SLP instances. */ 2193 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) 2194 vect_free_slp_instance (instance, false); 2195 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 2196 /* Reset SLP type to loop_vect on all stmts. 
*/ 2197 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2198 { 2199 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2200 for (gimple_stmt_iterator si = gsi_start_phis (bb); 2201 !gsi_end_p (si); gsi_next (&si)) 2202 { 2203 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2204 STMT_SLP_TYPE (stmt_info) = loop_vect; 2205 } 2206 for (gimple_stmt_iterator si = gsi_start_bb (bb); 2207 !gsi_end_p (si); gsi_next (&si)) 2208 { 2209 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2210 STMT_SLP_TYPE (stmt_info) = loop_vect; 2211 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2212 { 2213 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 2214 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); 2215 STMT_SLP_TYPE (stmt_info) = loop_vect; 2216 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq); 2217 !gsi_end_p (pi); gsi_next (&pi)) 2218 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi))) 2219 = loop_vect; 2220 } 2221 } 2222 } 2223 /* Free optimized alias test DDRS. */ 2224 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); 2225 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); 2226 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); 2227 /* Reset target cost data. */ 2228 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); 2229 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) 2230 = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); 2231 /* Reset accumulated rgroup information. */ 2232 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo)); 2233 /* Reset assorted flags. */ 2234 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2235 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; 2236 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; 2237 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; 2238 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; 2239 2240 goto start_over; 2241 } 2242 2243 /* Function vect_analyze_loop. 2244 2245 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2246 for it. The different analyses will record information in the 2247 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must 2248 be vectorized. */ 2249 opt_loop_vec_info 2250 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, 2251 vec_info_shared *shared) 2252 { 2253 auto_vector_sizes vector_sizes; 2254 2255 /* Autodetect first vector size we try. */ 2256 current_vector_size = 0; 2257 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 2258 unsigned int next_size = 0; 2259 2260 DUMP_VECT_SCOPE ("analyze_loop_nest"); 2261 2262 if (loop_outer (loop) 2263 && loop_vec_info_for_loop (loop_outer (loop)) 2264 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) 2265 return opt_loop_vec_info::failure_at (vect_location, 2266 "outer-loop already vectorized.\n"); 2267 2268 if (!find_loop_nest (loop, &shared->loop_nest)) 2269 return opt_loop_vec_info::failure_at 2270 (vect_location, 2271 "not vectorized: loop nest containing two or more consecutive inner" 2272 " loops cannot be vectorized\n"); 2273 2274 unsigned n_stmts = 0; 2275 poly_uint64 autodetected_vector_size = 0; 2276 while (1) 2277 { 2278 /* Check the CFG characteristics of the loop (nesting, entry/exit). 
*/ 2279 opt_loop_vec_info loop_vinfo 2280 = vect_analyze_loop_form (loop, shared); 2281 if (!loop_vinfo) 2282 { 2283 if (dump_enabled_p ()) 2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2285 "bad loop form.\n"); 2286 return loop_vinfo; 2287 } 2288 2289 bool fatal = false; 2290 2291 if (orig_loop_vinfo) 2292 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; 2293 2294 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); 2295 if (res) 2296 { 2297 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 2298 2299 return loop_vinfo; 2300 } 2301 2302 delete loop_vinfo; 2303 2304 if (next_size == 0) 2305 autodetected_vector_size = current_vector_size; 2306 2307 if (next_size < vector_sizes.length () 2308 && known_eq (vector_sizes[next_size], autodetected_vector_size)) 2309 next_size += 1; 2310 2311 if (fatal 2312 || next_size == vector_sizes.length () 2313 || known_eq (current_vector_size, 0U)) 2314 return opt_loop_vec_info::propagate_failure (res); 2315 2316 /* Try the next biggest vector size. */ 2317 current_vector_size = vector_sizes[next_size++]; 2318 if (dump_enabled_p ()) 2319 { 2320 dump_printf_loc (MSG_NOTE, vect_location, 2321 "***** Re-trying analysis with " 2322 "vector size "); 2323 dump_dec (MSG_NOTE, current_vector_size); 2324 dump_printf (MSG_NOTE, "\n"); 2325 } 2326 } 2327 } 2328 2329 /* Return true if there is an in-order reduction function for CODE, storing 2330 it in *REDUC_FN if so. */ 2331 2332 static bool 2333 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn) 2334 { 2335 switch (code) 2336 { 2337 case PLUS_EXPR: 2338 *reduc_fn = IFN_FOLD_LEFT_PLUS; 2339 return true; 2340 2341 default: 2342 return false; 2343 } 2344 } 2345 2346 /* Function reduction_fn_for_scalar_code 2347 2348 Input: 2349 CODE - tree_code of a reduction operations. 2350 2351 Output: 2352 REDUC_FN - the corresponding internal function to be used to reduce the 2353 vector of partial results into a single scalar result, or IFN_LAST 2354 if the operation is a supported reduction operation, but does not have 2355 such an internal function. 2356 2357 Return FALSE if CODE currently cannot be vectorized as reduction. */ 2358 2359 static bool 2360 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) 2361 { 2362 switch (code) 2363 { 2364 case MAX_EXPR: 2365 *reduc_fn = IFN_REDUC_MAX; 2366 return true; 2367 2368 case MIN_EXPR: 2369 *reduc_fn = IFN_REDUC_MIN; 2370 return true; 2371 2372 case PLUS_EXPR: 2373 *reduc_fn = IFN_REDUC_PLUS; 2374 return true; 2375 2376 case BIT_AND_EXPR: 2377 *reduc_fn = IFN_REDUC_AND; 2378 return true; 2379 2380 case BIT_IOR_EXPR: 2381 *reduc_fn = IFN_REDUC_IOR; 2382 return true; 2383 2384 case BIT_XOR_EXPR: 2385 *reduc_fn = IFN_REDUC_XOR; 2386 return true; 2387 2388 case MULT_EXPR: 2389 case MINUS_EXPR: 2390 *reduc_fn = IFN_LAST; 2391 return true; 2392 2393 default: 2394 return false; 2395 } 2396 } 2397 2398 /* If there is a neutral value X such that SLP reduction NODE would not 2399 be affected by the introduction of additional X elements, return that X, 2400 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN 2401 is true if the SLP statements perform a single reduction, false if each 2402 statement performs an independent reduction. 
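     For example (purely illustrative): padding a PLUS_EXPR SLP reduction from 3 lanes up to a 4-lane vector can use 0 for the extra lane and a MULT_EXPR reduction can use 1, since adding 0 or multiplying by 1 leaves the accumulated result unchanged; for MIN_EXPR/MAX_EXPR no such universal constant exists, which is why the code below falls back to the initial value for reduction chains and to NULL_TREE otherwise.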
*/ 2403 2404 static tree 2405 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, 2406 bool reduc_chain) 2407 { 2408 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 2409 stmt_vec_info stmt_vinfo = stmts[0]; 2410 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); 2411 tree scalar_type = TREE_TYPE (vector_type); 2412 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; 2413 gcc_assert (loop); 2414 2415 switch (code) 2416 { 2417 case WIDEN_SUM_EXPR: 2418 case DOT_PROD_EXPR: 2419 case SAD_EXPR: 2420 case PLUS_EXPR: 2421 case MINUS_EXPR: 2422 case BIT_IOR_EXPR: 2423 case BIT_XOR_EXPR: 2424 return build_zero_cst (scalar_type); 2425 2426 case MULT_EXPR: 2427 return build_one_cst (scalar_type); 2428 2429 case BIT_AND_EXPR: 2430 return build_all_ones_cst (scalar_type); 2431 2432 case MAX_EXPR: 2433 case MIN_EXPR: 2434 /* For MIN/MAX the initial values are neutral. A reduction chain 2435 has only a single initial value, so that value is neutral for 2436 all statements. */ 2437 if (reduc_chain) 2438 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, 2439 loop_preheader_edge (loop)); 2440 return NULL_TREE; 2441 2442 default: 2443 return NULL_TREE; 2444 } 2445 } 2446 2447 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement 2448 STMT is printed with a message MSG. */ 2449 2450 static void 2451 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) 2452 { 2453 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); 2454 } 2455 2456 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction 2457 operation. Return true if the results of DEF_STMT_INFO are something 2458 that can be accumulated by such a reduction. */ 2459 2460 static bool 2461 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info) 2462 { 2463 return (is_gimple_assign (def_stmt_info->stmt) 2464 || is_gimple_call (def_stmt_info->stmt) 2465 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def 2466 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI 2467 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def 2468 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt)))); 2469 } 2470 2471 /* Detect SLP reduction of the form: 2472 2473 #a1 = phi <a5, a0> 2474 a2 = operation (a1) 2475 a3 = operation (a2) 2476 a4 = operation (a3) 2477 a5 = operation (a4) 2478 2479 #a = phi <a5> 2480 2481 PHI is the reduction phi node (#a1 = phi <a5, a0> above) 2482 FIRST_STMT is the first reduction stmt in the chain 2483 (a2 = operation (a1)). 2484 2485 Return TRUE if a reduction chain was detected. */ 2486 2487 static bool 2488 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, 2489 gimple *first_stmt) 2490 { 2491 struct loop *loop = (gimple_bb (phi))->loop_father; 2492 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); 2493 enum tree_code code; 2494 gimple *loop_use_stmt = NULL; 2495 stmt_vec_info use_stmt_info; 2496 tree lhs; 2497 imm_use_iterator imm_iter; 2498 use_operand_p use_p; 2499 int nloop_uses, size = 0, n_out_of_loop_uses; 2500 bool found = false; 2501 2502 if (loop != vect_loop) 2503 return false; 2504 2505 auto_vec<stmt_vec_info, 8> reduc_chain; 2506 lhs = PHI_RESULT (phi); 2507 code = gimple_assign_rhs_code (first_stmt); 2508 while (1) 2509 { 2510 nloop_uses = 0; 2511 n_out_of_loop_uses = 0; 2512 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 2513 { 2514 gimple *use_stmt = USE_STMT (use_p); 2515 if (is_gimple_debug (use_stmt)) 2516 continue; 2517 2518 /* Check if we got back to the reduction phi. 
*/ 2519 if (use_stmt == phi) 2520 { 2521 loop_use_stmt = use_stmt; 2522 found = true; 2523 break; 2524 } 2525 2526 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2527 { 2528 loop_use_stmt = use_stmt; 2529 nloop_uses++; 2530 } 2531 else 2532 n_out_of_loop_uses++; 2533 2534 /* There can be either a single use in the loop or two uses in 2535 phi nodes. */ 2536 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) 2537 return false; 2538 } 2539 2540 if (found) 2541 break; 2542 2543 /* We reached a statement with no loop uses. */ 2544 if (nloop_uses == 0) 2545 return false; 2546 2547 /* This is a loop exit phi, and we haven't reached the reduction phi. */ 2548 if (gimple_code (loop_use_stmt) == GIMPLE_PHI) 2549 return false; 2550 2551 if (!is_gimple_assign (loop_use_stmt) 2552 || code != gimple_assign_rhs_code (loop_use_stmt) 2553 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) 2554 return false; 2555 2556 /* Insert USE_STMT into reduction chain. */ 2557 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt); 2558 reduc_chain.safe_push (use_stmt_info); 2559 2560 lhs = gimple_assign_lhs (loop_use_stmt); 2561 size++; 2562 } 2563 2564 if (!found || loop_use_stmt != phi || size < 2) 2565 return false; 2566 2567 /* Swap the operands, if needed, to make the reduction operand be the second 2568 operand. */ 2569 lhs = PHI_RESULT (phi); 2570 for (unsigned i = 0; i < reduc_chain.length (); ++i) 2571 { 2572 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt); 2573 if (gimple_assign_rhs2 (next_stmt) == lhs) 2574 { 2575 tree op = gimple_assign_rhs1 (next_stmt); 2576 stmt_vec_info def_stmt_info = loop_info->lookup_def (op); 2577 2578 /* Check that the other def is either defined in the loop 2579 ("vect_internal_def"), or it's an induction (defined by a 2580 loop-header phi-node). */ 2581 if (def_stmt_info 2582 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) 2583 && vect_valid_reduction_input_p (def_stmt_info)) 2584 { 2585 lhs = gimple_assign_lhs (next_stmt); 2586 continue; 2587 } 2588 2589 return false; 2590 } 2591 else 2592 { 2593 tree op = gimple_assign_rhs2 (next_stmt); 2594 stmt_vec_info def_stmt_info = loop_info->lookup_def (op); 2595 2596 /* Check that the other def is either defined in the loop 2597 ("vect_internal_def"), or it's an induction (defined by a 2598 loop-header phi-node). */ 2599 if (def_stmt_info 2600 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) 2601 && vect_valid_reduction_input_p (def_stmt_info)) 2602 { 2603 if (dump_enabled_p ()) 2604 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G", 2605 next_stmt); 2606 2607 swap_ssa_operands (next_stmt, 2608 gimple_assign_rhs1_ptr (next_stmt), 2609 gimple_assign_rhs2_ptr (next_stmt)); 2610 update_stmt (next_stmt); 2611 2612 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) 2613 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 2614 } 2615 else 2616 return false; 2617 } 2618 2619 lhs = gimple_assign_lhs (next_stmt); 2620 } 2621 2622 /* Build up the actual chain. */ 2623 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) 2624 { 2625 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; 2626 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; 2627 } 2628 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; 2629 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; 2630 2631 /* Save the chain for further analysis in SLP detection.
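     As an illustration, reusing the a2..a5 chain from the comment at the top of this function: after the link-building loop above, REDUC_GROUP_FIRST_ELEMENT of every statement in the chain points to a2, REDUC_GROUP_NEXT_ELEMENT links a2 -> a3 -> a4 -> a5 -> NULL, and the group size recorded below is 4.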
*/ 2632 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); 2633 REDUC_GROUP_SIZE (reduc_chain[0]) = size; 2634 2635 return true; 2636 } 2637 2638 /* Return true if we need an in-order reduction for operation CODE 2639 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer 2640 overflow must wrap. */ 2641 2642 static bool 2643 needs_fold_left_reduction_p (tree type, tree_code code, 2644 bool need_wrapping_integral_overflow) 2645 { 2646 /* CHECKME: check for !flag_finite_math_only too? */ 2647 if (SCALAR_FLOAT_TYPE_P (type)) 2648 switch (code) 2649 { 2650 case MIN_EXPR: 2651 case MAX_EXPR: 2652 return false; 2653 2654 default: 2655 return !flag_associative_math; 2656 } 2657 2658 if (INTEGRAL_TYPE_P (type)) 2659 { 2660 if (!operation_no_trapping_overflow (type, code)) 2661 return true; 2662 if (need_wrapping_integral_overflow 2663 && !TYPE_OVERFLOW_WRAPS (type) 2664 && operation_can_overflow (code)) 2665 return true; 2666 return false; 2667 } 2668 2669 if (SAT_FIXED_POINT_TYPE_P (type)) 2670 return true; 2671 2672 return false; 2673 } 2674 2675 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and 2676 reduction operation CODE has a handled computation expression. */ 2677 2678 bool 2679 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, 2680 tree loop_arg, enum tree_code code) 2681 { 2682 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; 2683 auto_bitmap visited; 2684 tree lookfor = PHI_RESULT (phi); 2685 ssa_op_iter curri; 2686 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); 2687 while (USE_FROM_PTR (curr) != loop_arg) 2688 curr = op_iter_next_use (&curri); 2689 curri.i = curri.numops; 2690 do 2691 { 2692 path.safe_push (std::make_pair (curri, curr)); 2693 tree use = USE_FROM_PTR (curr); 2694 if (use == lookfor) 2695 break; 2696 gimple *def = SSA_NAME_DEF_STMT (use); 2697 if (gimple_nop_p (def) 2698 || ! flow_bb_inside_loop_p (loop, gimple_bb (def))) 2699 { 2700 pop: 2701 do 2702 { 2703 std::pair<ssa_op_iter, use_operand_p> x = path.pop (); 2704 curri = x.first; 2705 curr = x.second; 2706 do 2707 curr = op_iter_next_use (&curri); 2708 /* Skip already visited or non-SSA operands (from iterating 2709 over PHI args). */ 2710 while (curr != NULL_USE_OPERAND_P 2711 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 2712 || ! bitmap_set_bit (visited, 2713 SSA_NAME_VERSION 2714 (USE_FROM_PTR (curr))))); 2715 } 2716 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ()); 2717 if (curr == NULL_USE_OPERAND_P) 2718 break; 2719 } 2720 else 2721 { 2722 if (gimple_code (def) == GIMPLE_PHI) 2723 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE); 2724 else 2725 curr = op_iter_init_use (&curri, def, SSA_OP_USE); 2726 while (curr != NULL_USE_OPERAND_P 2727 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 2728 || ! bitmap_set_bit (visited, 2729 SSA_NAME_VERSION 2730 (USE_FROM_PTR (curr))))) 2731 curr = op_iter_next_use (&curri); 2732 if (curr == NULL_USE_OPERAND_P) 2733 goto pop; 2734 } 2735 } 2736 while (1); 2737 if (dump_file && (dump_flags & TDF_DETAILS)) 2738 { 2739 dump_printf_loc (MSG_NOTE, loc, "reduction path: "); 2740 unsigned i; 2741 std::pair<ssa_op_iter, use_operand_p> *x; 2742 FOR_EACH_VEC_ELT (path, i, x) 2743 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second)); 2744 dump_printf (MSG_NOTE, "\n"); 2745 } 2746 2747 /* Check whether the reduction path detected is valid. 
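     To illustrate the rules checked below (example statements invented for this comment): a path a1 -> a2 = a1 + x -> a3 = a2 + y in which every intermediate result has a single use is accepted for code == PLUS_EXPR; a step of the form a3 = a2 - y continues the PLUS/MINUS chain, whereas a3 = y - a2 negates the running value and flips NEG, and a path that ends with NEG still set is rejected.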
*/ 2748 bool fail = path.length () == 0; 2749 bool neg = false; 2750 for (unsigned i = 1; i < path.length (); ++i) 2751 { 2752 gimple *use_stmt = USE_STMT (path[i].second); 2753 tree op = USE_FROM_PTR (path[i].second); 2754 if (! has_single_use (op) 2755 || ! is_gimple_assign (use_stmt)) 2756 { 2757 fail = true; 2758 break; 2759 } 2760 if (gimple_assign_rhs_code (use_stmt) != code) 2761 { 2762 if (code == PLUS_EXPR 2763 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) 2764 { 2765 /* Track whether we negate the reduction value each iteration. */ 2766 if (gimple_assign_rhs2 (use_stmt) == op) 2767 neg = ! neg; 2768 } 2769 else 2770 { 2771 fail = true; 2772 break; 2773 } 2774 } 2775 } 2776 return ! fail && ! neg; 2777 } 2778 2779 2780 /* Function vect_is_simple_reduction 2781 2782 (1) Detect a cross-iteration def-use cycle that represents a simple 2783 reduction computation. We look for the following pattern: 2784 2785 loop_header: 2786 a1 = phi < a0, a2 > 2787 a3 = ... 2788 a2 = operation (a3, a1) 2789 2790 or 2791 2792 a3 = ... 2793 loop_header: 2794 a1 = phi < a0, a2 > 2795 a2 = operation (a3, a1) 2796 2797 such that: 2798 1. operation is commutative and associative and it is safe to 2799 change the order of the computation 2800 2. no uses for a2 in the loop (a2 is used out of the loop) 2801 3. no uses of a1 in the loop besides the reduction operation 2802 4. no uses of a1 outside the loop. 2803 2804 Conditions 1,4 are tested here. 2805 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. 2806 2807 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 2808 nested cycles. 2809 2810 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double 2811 reductions: 2812 2813 a1 = phi < a0, a2 > 2814 inner loop (def of a3) 2815 a2 = phi < a3 > 2816 2817 (4) Detect condition expressions, ie: 2818 for (int i = 0; i < N; i++) 2819 if (a[i] < val) 2820 ret_val = a[i]; 2821 2822 */ 2823 2824 static stmt_vec_info 2825 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, 2826 bool *double_reduc, 2827 bool need_wrapping_integral_overflow, 2828 enum vect_reduction_type *v_reduc_type) 2829 { 2830 gphi *phi = as_a <gphi *> (phi_info->stmt); 2831 struct loop *loop = (gimple_bb (phi))->loop_father; 2832 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); 2833 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); 2834 gimple *phi_use_stmt = NULL; 2835 enum tree_code orig_code, code; 2836 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; 2837 tree type; 2838 tree name; 2839 imm_use_iterator imm_iter; 2840 use_operand_p use_p; 2841 bool phi_def; 2842 2843 *double_reduc = false; 2844 *v_reduc_type = TREE_CODE_REDUCTION; 2845 2846 tree phi_name = PHI_RESULT (phi); 2847 /* ??? If there are no uses of the PHI result the inner loop reduction 2848 won't be detected as possibly double-reduction by vectorizable_reduction 2849 because that tries to walk the PHI arg from the preheader edge which 2850 can be constant. See PR60382. 
*/ 2851 if (has_zero_uses (phi_name)) 2852 return NULL; 2853 unsigned nphi_def_loop_uses = 0; 2854 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) 2855 { 2856 gimple *use_stmt = USE_STMT (use_p); 2857 if (is_gimple_debug (use_stmt)) 2858 continue; 2859 2860 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2861 { 2862 if (dump_enabled_p ()) 2863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2864 "intermediate value used outside loop.\n"); 2865 2866 return NULL; 2867 } 2868 2869 nphi_def_loop_uses++; 2870 phi_use_stmt = use_stmt; 2871 } 2872 2873 edge latch_e = loop_latch_edge (loop); 2874 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 2875 if (TREE_CODE (loop_arg) != SSA_NAME) 2876 { 2877 if (dump_enabled_p ()) 2878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2879 "reduction: not ssa_name: %T\n", loop_arg); 2880 return NULL; 2881 } 2882 2883 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg); 2884 if (!def_stmt_info 2885 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) 2886 return NULL; 2887 2888 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt)) 2889 { 2890 name = gimple_assign_lhs (def_stmt); 2891 phi_def = false; 2892 } 2893 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt)) 2894 { 2895 name = PHI_RESULT (def_stmt); 2896 phi_def = true; 2897 } 2898 else 2899 { 2900 if (dump_enabled_p ()) 2901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2902 "reduction: unhandled reduction operation: %G", 2903 def_stmt_info->stmt); 2904 return NULL; 2905 } 2906 2907 unsigned nlatch_def_loop_uses = 0; 2908 auto_vec<gphi *, 3> lcphis; 2909 bool inner_loop_of_double_reduc = false; 2910 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 2911 { 2912 gimple *use_stmt = USE_STMT (use_p); 2913 if (is_gimple_debug (use_stmt)) 2914 continue; 2915 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2916 nlatch_def_loop_uses++; 2917 else 2918 { 2919 /* We can have more than one loop-closed PHI. */ 2920 lcphis.safe_push (as_a <gphi *> (use_stmt)); 2921 if (nested_in_vect_loop 2922 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt)) 2923 == vect_double_reduction_def)) 2924 inner_loop_of_double_reduc = true; 2925 } 2926 } 2927 2928 /* If this isn't a nested cycle, or if the nested cycle reduction value 2929 is used outside of the inner loop, we cannot handle uses of the reduction 2930 value. */ 2931 if ((!nested_in_vect_loop || inner_loop_of_double_reduc) 2932 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)) 2933 { 2934 if (dump_enabled_p ()) 2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2936 "reduction used in loop.\n"); 2937 return NULL; 2938 } 2939 2940 /* If DEF_STMT is a phi node itself, we expect it to have a single argument 2941 defined in the inner loop.
*/ 2942 if (phi_def) 2943 { 2944 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt); 2945 op1 = PHI_ARG_DEF (def_stmt, 0); 2946 2947 if (gimple_phi_num_args (def_stmt) != 1 2948 || TREE_CODE (op1) != SSA_NAME) 2949 { 2950 if (dump_enabled_p ()) 2951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2952 "unsupported phi node definition.\n"); 2953 2954 return NULL; 2955 } 2956 2957 gimple *def1 = SSA_NAME_DEF_STMT (op1); 2958 if (gimple_bb (def1) 2959 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2960 && loop->inner 2961 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) 2962 && is_gimple_assign (def1) 2963 && is_a <gphi *> (phi_use_stmt) 2964 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) 2965 { 2966 if (dump_enabled_p ()) 2967 report_vect_op (MSG_NOTE, def_stmt, 2968 "detected double reduction: "); 2969 2970 *double_reduc = true; 2971 return def_stmt_info; 2972 } 2973 2974 return NULL; 2975 } 2976 2977 /* If we are vectorizing an inner reduction we are executing that 2978 in the original order only in case we are not dealing with a 2979 double reduction. */ 2980 bool check_reduction = true; 2981 if (flow_loop_nested_p (vect_loop, loop)) 2982 { 2983 gphi *lcphi; 2984 unsigned i; 2985 check_reduction = false; 2986 FOR_EACH_VEC_ELT (lcphis, i, lcphi) 2987 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) 2988 { 2989 gimple *use_stmt = USE_STMT (use_p); 2990 if (is_gimple_debug (use_stmt)) 2991 continue; 2992 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) 2993 check_reduction = true; 2994 } 2995 } 2996 2997 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt); 2998 code = orig_code = gimple_assign_rhs_code (def_stmt); 2999 3000 if (nested_in_vect_loop && !check_reduction) 3001 { 3002 /* FIXME: Even for non-reductions code generation is funneled 3003 through vectorizable_reduction for the stmt defining the 3004 PHI latch value. So we have to artificially restrict ourselves 3005 for the supported operations. */ 3006 switch (get_gimple_rhs_class (code)) 3007 { 3008 case GIMPLE_BINARY_RHS: 3009 case GIMPLE_TERNARY_RHS: 3010 break; 3011 default: 3012 /* Not supported by vectorizable_reduction. */ 3013 if (dump_enabled_p ()) 3014 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3015 "nested cycle: not handled operation: "); 3016 return NULL; 3017 } 3018 if (dump_enabled_p ()) 3019 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: "); 3020 return def_stmt_info; 3021 } 3022 3023 /* We can handle "res -= x[i]", which is non-associative by 3024 simply rewriting this into "res += -x[i]". Avoid changing 3025 gimple instruction for the first simple tests and only do this 3026 if we're allowed to change code at all. */ 3027 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) 3028 code = PLUS_EXPR; 3029 3030 if (code == COND_EXPR) 3031 { 3032 if (! 
nested_in_vect_loop) 3033 *v_reduc_type = COND_REDUCTION; 3034 3035 op3 = gimple_assign_rhs1 (def_stmt); 3036 if (COMPARISON_CLASS_P (op3)) 3037 { 3038 op4 = TREE_OPERAND (op3, 1); 3039 op3 = TREE_OPERAND (op3, 0); 3040 } 3041 if (op3 == phi_name || op4 == phi_name) 3042 { 3043 if (dump_enabled_p ()) 3044 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3045 "reduction: condition depends on previous" 3046 " iteration: "); 3047 return NULL; 3048 } 3049 3050 op1 = gimple_assign_rhs2 (def_stmt); 3051 op2 = gimple_assign_rhs3 (def_stmt); 3052 } 3053 else if (!commutative_tree_code (code) || !associative_tree_code (code)) 3054 { 3055 if (dump_enabled_p ()) 3056 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3057 "reduction: not commutative/associative: "); 3058 return NULL; 3059 } 3060 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) 3061 { 3062 op1 = gimple_assign_rhs1 (def_stmt); 3063 op2 = gimple_assign_rhs2 (def_stmt); 3064 } 3065 else 3066 { 3067 if (dump_enabled_p ()) 3068 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3069 "reduction: not handled operation: "); 3070 return NULL; 3071 } 3072 3073 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) 3074 { 3075 if (dump_enabled_p ()) 3076 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3077 "reduction: both uses not ssa_names: "); 3078 3079 return NULL; 3080 } 3081 3082 type = TREE_TYPE (gimple_assign_lhs (def_stmt)); 3083 if ((TREE_CODE (op1) == SSA_NAME 3084 && !types_compatible_p (type,TREE_TYPE (op1))) 3085 || (TREE_CODE (op2) == SSA_NAME 3086 && !types_compatible_p (type, TREE_TYPE (op2))) 3087 || (op3 && TREE_CODE (op3) == SSA_NAME 3088 && !types_compatible_p (type, TREE_TYPE (op3))) 3089 || (op4 && TREE_CODE (op4) == SSA_NAME 3090 && !types_compatible_p (type, TREE_TYPE (op4)))) 3091 { 3092 if (dump_enabled_p ()) 3093 { 3094 dump_printf_loc (MSG_NOTE, vect_location, 3095 "reduction: multiple types: operation type: " 3096 "%T, operands types: %T,%T", 3097 type, TREE_TYPE (op1), TREE_TYPE (op2)); 3098 if (op3) 3099 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3)); 3100 3101 if (op4) 3102 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4)); 3103 dump_printf (MSG_NOTE, "\n"); 3104 } 3105 3106 return NULL; 3107 } 3108 3109 /* Check whether it's ok to change the order of the computation. 3110 Generally, when vectorizing a reduction we change the order of the 3111 computation. This may change the behavior of the program in some 3112 cases, so we need to check that this is ok. One exception is when 3113 vectorizing an outer-loop: the inner-loop is executed sequentially, 3114 and therefore vectorizing reductions in the inner-loop during 3115 outer-loop vectorization is safe. */ 3116 if (check_reduction 3117 && *v_reduc_type == TREE_CODE_REDUCTION 3118 && needs_fold_left_reduction_p (type, code, 3119 need_wrapping_integral_overflow)) 3120 *v_reduc_type = FOLD_LEFT_REDUCTION; 3121 3122 /* Reduction is safe. We're dealing with one of the following: 3123 1) integer arithmetic and no trapv 3124 2) floating point arithmetic, and special flags permit this optimization 3125 3) nested cycle (i.e., outer loop vectorization). 
*/ 3126 stmt_vec_info def1_info = loop_info->lookup_def (op1); 3127 stmt_vec_info def2_info = loop_info->lookup_def (op2); 3128 if (code != COND_EXPR && !def1_info && !def2_info) 3129 { 3130 if (dump_enabled_p ()) 3131 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: "); 3132 return NULL; 3133 } 3134 3135 /* Check that one def is the reduction def, defined by PHI, 3136 the other def is either defined in the loop ("vect_internal_def"), 3137 or it's an induction (defined by a loop-header phi-node). */ 3138 3139 if (def2_info 3140 && def2_info->stmt == phi 3141 && (code == COND_EXPR 3142 || !def1_info 3143 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt)) 3144 || vect_valid_reduction_input_p (def1_info))) 3145 { 3146 if (dump_enabled_p ()) 3147 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3148 return def_stmt_info; 3149 } 3150 3151 if (def1_info 3152 && def1_info->stmt == phi 3153 && (code == COND_EXPR 3154 || !def2_info 3155 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt)) 3156 || vect_valid_reduction_input_p (def2_info))) 3157 { 3158 if (! nested_in_vect_loop && orig_code != MINUS_EXPR) 3159 { 3160 /* Check if we can swap operands (just for simplicity - so that 3161 the rest of the code can assume that the reduction variable 3162 is always the last (second) argument). */ 3163 if (code == COND_EXPR) 3164 { 3165 /* Swap cond_expr by inverting the condition. */ 3166 tree cond_expr = gimple_assign_rhs1 (def_stmt); 3167 enum tree_code invert_code = ERROR_MARK; 3168 enum tree_code cond_code = TREE_CODE (cond_expr); 3169 3170 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) 3171 { 3172 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); 3173 invert_code = invert_tree_comparison (cond_code, honor_nans); 3174 } 3175 if (invert_code != ERROR_MARK) 3176 { 3177 TREE_SET_CODE (cond_expr, invert_code); 3178 swap_ssa_operands (def_stmt, 3179 gimple_assign_rhs2_ptr (def_stmt), 3180 gimple_assign_rhs3_ptr (def_stmt)); 3181 } 3182 else 3183 { 3184 if (dump_enabled_p ()) 3185 report_vect_op (MSG_NOTE, def_stmt, 3186 "detected reduction: cannot swap operands " 3187 "for cond_expr"); 3188 return NULL; 3189 } 3190 } 3191 else 3192 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), 3193 gimple_assign_rhs2_ptr (def_stmt)); 3194 3195 if (dump_enabled_p ()) 3196 report_vect_op (MSG_NOTE, def_stmt, 3197 "detected reduction: need to swap operands: "); 3198 3199 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) 3200 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 3201 } 3202 else 3203 { 3204 if (dump_enabled_p ()) 3205 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3206 } 3207 3208 return def_stmt_info; 3209 } 3210 3211 /* Try to find SLP reduction chain. */ 3212 if (! nested_in_vect_loop 3213 && code != COND_EXPR 3214 && orig_code != MINUS_EXPR 3215 && vect_is_slp_reduction (loop_info, phi, def_stmt)) 3216 { 3217 if (dump_enabled_p ()) 3218 report_vect_op (MSG_NOTE, def_stmt, 3219 "reduction: detected reduction chain: "); 3220 3221 return def_stmt_info; 3222 } 3223 3224 /* Look for the expression computing loop_arg from loop PHI result. 
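     For example (illustrative, in the notation used by the comments in this file):

       a1 = phi < a0, a3 >
       a2 = a1 + b[i];
       a3 = a2 + c[i];

     Here neither direct operand of the statement feeding the latch is the PHI result itself, so the checks above do not trigger, but check_reduction_path can still walk a3 back to the PHI through two single-use PLUS_EXPR statements and accept the reduction.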
*/ 3225 if (check_reduction_path (vect_location, loop, phi, loop_arg, code)) 3226 return def_stmt_info; 3227 3228 if (dump_enabled_p ()) 3229 { 3230 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3231 "reduction: unknown pattern: "); 3232 } 3233 3234 return NULL; 3235 } 3236 3237 /* Wrapper around vect_is_simple_reduction, which will modify code 3238 in-place if it enables detection of more reductions. Arguments 3239 as there. */ 3240 3241 stmt_vec_info 3242 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, 3243 bool *double_reduc, 3244 bool need_wrapping_integral_overflow) 3245 { 3246 enum vect_reduction_type v_reduc_type; 3247 stmt_vec_info def_info 3248 = vect_is_simple_reduction (loop_info, phi_info, double_reduc, 3249 need_wrapping_integral_overflow, 3250 &v_reduc_type); 3251 if (def_info) 3252 { 3253 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type; 3254 STMT_VINFO_REDUC_DEF (phi_info) = def_info; 3255 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type; 3256 STMT_VINFO_REDUC_DEF (def_info) = phi_info; 3257 } 3258 return def_info; 3259 } 3260 3261 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ 3262 int 3263 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, 3264 int *peel_iters_epilogue, 3265 stmt_vector_for_cost *scalar_cost_vec, 3266 stmt_vector_for_cost *prologue_cost_vec, 3267 stmt_vector_for_cost *epilogue_cost_vec) 3268 { 3269 int retval = 0; 3270 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3271 3272 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 3273 { 3274 *peel_iters_epilogue = assumed_vf / 2; 3275 if (dump_enabled_p ()) 3276 dump_printf_loc (MSG_NOTE, vect_location, 3277 "cost model: epilogue peel iters set to vf/2 " 3278 "because loop iterations are unknown .\n"); 3279 3280 /* If peeled iterations are known but number of scalar loop 3281 iterations are unknown, count a taken branch per peeled loop. */ 3282 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3283 NULL, 0, vect_prologue); 3284 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3285 NULL, 0, vect_epilogue); 3286 } 3287 else 3288 { 3289 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); 3290 peel_iters_prologue = niters < peel_iters_prologue ? 3291 niters : peel_iters_prologue; 3292 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; 3293 /* If we need to peel for gaps, but no peeling is required, we have to 3294 peel VF iterations. */ 3295 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue) 3296 *peel_iters_epilogue = assumed_vf; 3297 } 3298 3299 stmt_info_for_cost *si; 3300 int j; 3301 if (peel_iters_prologue) 3302 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3303 retval += record_stmt_cost (prologue_cost_vec, 3304 si->count * peel_iters_prologue, 3305 si->kind, si->stmt_info, si->misalign, 3306 vect_prologue); 3307 if (*peel_iters_epilogue) 3308 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3309 retval += record_stmt_cost (epilogue_cost_vec, 3310 si->count * *peel_iters_epilogue, 3311 si->kind, si->stmt_info, si->misalign, 3312 vect_epilogue); 3313 3314 return retval; 3315 } 3316 3317 /* Function vect_estimate_min_profitable_iters 3318 3319 Return the number of iterations required for the vector version of the 3320 loop to be profitable relative to the cost of the scalar version of the 3321 loop. 3322 3323 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold 3324 of iterations for vectorization. -1 value means loop vectorization 3325 is not profitable. 
This returned value may be used for dynamic 3326 profitability check. 3327 3328 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used 3329 for static check against estimated number of iterations. */ 3330 3331 static void 3332 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, 3333 int *ret_min_profitable_niters, 3334 int *ret_min_profitable_estimate) 3335 { 3336 int min_profitable_iters; 3337 int min_profitable_estimate; 3338 int peel_iters_prologue; 3339 int peel_iters_epilogue; 3340 unsigned vec_inside_cost = 0; 3341 int vec_outside_cost = 0; 3342 unsigned vec_prologue_cost = 0; 3343 unsigned vec_epilogue_cost = 0; 3344 int scalar_single_iter_cost = 0; 3345 int scalar_outside_cost = 0; 3346 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3347 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 3348 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3349 3350 /* Cost model disabled. */ 3351 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 3352 { 3353 if (dump_enabled_p ()) 3354 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); 3355 *ret_min_profitable_niters = 0; 3356 *ret_min_profitable_estimate = 0; 3357 return; 3358 } 3359 3360 /* Requires loop versioning tests to handle misalignment. */ 3361 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 3362 { 3363 /* FIXME: Make cost depend on complexity of individual check. */ 3364 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); 3365 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3366 vect_prologue); 3367 if (dump_enabled_p ()) 3368 dump_printf (MSG_NOTE, 3369 "cost model: Adding cost of checks for loop " 3370 "versioning to treat misalignment.\n"); 3371 } 3372 3373 /* Requires loop versioning with alias checks. */ 3374 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 3375 { 3376 /* FIXME: Make cost depend on complexity of individual check. */ 3377 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); 3378 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3379 vect_prologue); 3380 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length (); 3381 if (len) 3382 /* Count LEN - 1 ANDs and LEN comparisons. */ 3383 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt, 3384 NULL, 0, vect_prologue); 3385 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); 3386 if (len) 3387 { 3388 /* Count LEN - 1 ANDs and LEN comparisons. */ 3389 unsigned int nstmts = len * 2 - 1; 3390 /* +1 for each bias that needs adding. */ 3391 for (unsigned int i = 0; i < len; ++i) 3392 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) 3393 nstmts += 1; 3394 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, 3395 NULL, 0, vect_prologue); 3396 } 3397 if (dump_enabled_p ()) 3398 dump_printf (MSG_NOTE, 3399 "cost model: Adding cost of checks for loop " 3400 "versioning aliasing.\n"); 3401 } 3402 3403 /* Requires loop versioning with niter checks. */ 3404 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) 3405 { 3406 /* FIXME: Make cost depend on complexity of individual check. 
*/ 3407 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, 3408 vect_prologue); 3409 if (dump_enabled_p ()) 3410 dump_printf (MSG_NOTE, 3411 "cost model: Adding cost of checks for loop " 3412 "versioning niters.\n"); 3413 } 3414 3415 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3416 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, 3417 vect_prologue); 3418 3419 /* Count statements in scalar loop. Using this as scalar cost for a single 3420 iteration for now. 3421 3422 TODO: Add outer loop support. 3423 3424 TODO: Consider assigning different costs to different scalar 3425 statements. */ 3426 3427 scalar_single_iter_cost 3428 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo); 3429 3430 /* Add additional cost for the peeled instructions in prologue and epilogue 3431 loop. (For fully-masked loops there will be no peeling.) 3432 3433 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 3434 at compile-time - we assume it's vf/2 (the worst would be vf-1). 3435 3436 TODO: Build an expression that represents peel_iters for prologue and 3437 epilogue to be used in a run-time test. */ 3438 3439 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3440 { 3441 peel_iters_prologue = 0; 3442 peel_iters_epilogue = 0; 3443 3444 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 3445 { 3446 /* We need to peel exactly one iteration. */ 3447 peel_iters_epilogue += 1; 3448 stmt_info_for_cost *si; 3449 int j; 3450 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 3451 j, si) 3452 (void) add_stmt_cost (target_cost_data, si->count, 3453 si->kind, si->stmt_info, si->misalign, 3454 vect_epilogue); 3455 } 3456 } 3457 else if (npeel < 0) 3458 { 3459 peel_iters_prologue = assumed_vf / 2; 3460 if (dump_enabled_p ()) 3461 dump_printf (MSG_NOTE, "cost model: " 3462 "prologue peel iters set to vf/2.\n"); 3463 3464 /* If peeling for alignment is unknown, loop bound of main loop becomes 3465 unknown. */ 3466 peel_iters_epilogue = assumed_vf / 2; 3467 if (dump_enabled_p ()) 3468 dump_printf (MSG_NOTE, "cost model: " 3469 "epilogue peel iters set to vf/2 because " 3470 "peeling for alignment is unknown.\n"); 3471 3472 /* If peeled iterations are unknown, count a taken branch and a not taken 3473 branch per peeled loop. Even if scalar loop iterations are known, 3474 vector iterations are not known since peeled prologue iterations are 3475 not known. Hence guards remain the same. 
*/ 3476 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3477 NULL, 0, vect_prologue); 3478 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3479 NULL, 0, vect_prologue); 3480 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3481 NULL, 0, vect_epilogue); 3482 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3483 NULL, 0, vect_epilogue); 3484 stmt_info_for_cost *si; 3485 int j; 3486 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) 3487 { 3488 (void) add_stmt_cost (target_cost_data, 3489 si->count * peel_iters_prologue, 3490 si->kind, si->stmt_info, si->misalign, 3491 vect_prologue); 3492 (void) add_stmt_cost (target_cost_data, 3493 si->count * peel_iters_epilogue, 3494 si->kind, si->stmt_info, si->misalign, 3495 vect_epilogue); 3496 } 3497 } 3498 else 3499 { 3500 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 3501 stmt_info_for_cost *si; 3502 int j; 3503 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3504 3505 prologue_cost_vec.create (2); 3506 epilogue_cost_vec.create (2); 3507 peel_iters_prologue = npeel; 3508 3509 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue, 3510 &peel_iters_epilogue, 3511 &LOOP_VINFO_SCALAR_ITERATION_COST 3512 (loop_vinfo), 3513 &prologue_cost_vec, 3514 &epilogue_cost_vec); 3515 3516 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si) 3517 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info, 3518 si->misalign, vect_prologue); 3519 3520 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si) 3521 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info, 3522 si->misalign, vect_epilogue); 3523 3524 prologue_cost_vec.release (); 3525 epilogue_cost_vec.release (); 3526 } 3527 3528 /* FORNOW: The scalar outside cost is incremented in one of the 3529 following ways: 3530 3531 1. The vectorizer checks for alignment and aliasing and generates 3532 a condition that allows dynamic vectorization. A cost model 3533 check is ANDED with the versioning condition. Hence scalar code 3534 path now has the added cost of the versioning check. 3535 3536 if (cost > th & versioning_check) 3537 jmp to vector code 3538 3539 Hence run-time scalar is incremented by not-taken branch cost. 3540 3541 2. The vectorizer then checks if a prologue is required. If the 3542 cost model check was not done before during versioning, it has to 3543 be done before the prologue check. 3544 3545 if (cost <= th) 3546 prologue = scalar_iters 3547 if (prologue == 0) 3548 jmp to vector code 3549 else 3550 execute prologue 3551 if (prologue == num_iters) 3552 go to exit 3553 3554 Hence the run-time scalar cost is incremented by a taken branch, 3555 plus a not-taken branch, plus a taken branch cost. 3556 3557 3. The vectorizer then checks if an epilogue is required. If the 3558 cost model check was not done before during prologue check, it 3559 has to be done with the epilogue check. 3560 3561 if (prologue == 0) 3562 jmp to vector code 3563 else 3564 execute prologue 3565 if (prologue == num_iters) 3566 go to exit 3567 vector code: 3568 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) 3569 jmp to epilogue 3570 3571 Hence the run-time scalar cost should be incremented by 2 taken 3572 branches. 3573 3574 TODO: The back end may reorder the BBS's differently and reverse 3575 conditions/branch directions. Change the estimates below to 3576 something more reasonable. 
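In the code below these cases are approximated as follows: case 1 adds one not-taken branch (the cost model check is folded into the versioning condition), case 2 adds two taken branches plus one not-taken branch, and case 3 adds two taken branches.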
*/ 3577 3578 /* If the number of iterations is known and we do not do versioning, we can 3579 decide whether to vectorize at compile time. Hence the scalar version 3580 do not carry cost model guard costs. */ 3581 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 3582 || LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3583 { 3584 /* Cost model check occurs at versioning. */ 3585 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3586 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken); 3587 else 3588 { 3589 /* Cost model check occurs at prologue generation. */ 3590 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 3591 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken) 3592 + vect_get_stmt_cost (cond_branch_not_taken); 3593 /* Cost model check occurs at epilogue generation. */ 3594 else 3595 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken); 3596 } 3597 } 3598 3599 /* Complete the target-specific cost calculations. */ 3600 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, 3601 &vec_inside_cost, &vec_epilogue_cost); 3602 3603 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); 3604 3605 if (dump_enabled_p ()) 3606 { 3607 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); 3608 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", 3609 vec_inside_cost); 3610 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", 3611 vec_prologue_cost); 3612 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", 3613 vec_epilogue_cost); 3614 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n", 3615 scalar_single_iter_cost); 3616 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n", 3617 scalar_outside_cost); 3618 dump_printf (MSG_NOTE, " Vector outside cost: %d\n", 3619 vec_outside_cost); 3620 dump_printf (MSG_NOTE, " prologue iterations: %d\n", 3621 peel_iters_prologue); 3622 dump_printf (MSG_NOTE, " epilogue iterations: %d\n", 3623 peel_iters_epilogue); 3624 } 3625 3626 /* Calculate number of iterations required to make the vector version 3627 profitable, relative to the loop bodies only. The following condition 3628 must hold true: 3629 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC 3630 where 3631 SIC = scalar iteration cost, VIC = vector iteration cost, 3632 VOC = vector outside cost, VF = vectorization factor, 3633 NPEEL = prologue iterations + epilogue iterations, 3634 SOC = scalar outside cost for run time cost model check. */ 3635 3636 int saving_per_viter = (scalar_single_iter_cost * assumed_vf 3637 - vec_inside_cost); 3638 if (saving_per_viter <= 0) 3639 { 3640 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) 3641 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, 3642 "vectorization did not happen for a simd loop"); 3643 3644 if (dump_enabled_p ()) 3645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3646 "cost model: the vector iteration cost = %d " 3647 "divided by the scalar iteration cost = %d " 3648 "is greater or equal to the vectorization factor = %d" 3649 ".\n", 3650 vec_inside_cost, scalar_single_iter_cost, assumed_vf); 3651 *ret_min_profitable_niters = -1; 3652 *ret_min_profitable_estimate = -1; 3653 return; 3654 } 3655 3656 /* ??? The "if" arm is written to handle all cases; see below for what 3657 we would do for !LOOP_VINFO_FULLY_MASKED_P. 
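As a purely illustrative example of the !LOOP_VINFO_FULLY_MASKED_P arm below (all numbers made up):

      SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 0, NPEEL = 0
      saving_per_viter = SIC * VF - VIC = 16 - 6 = 10
      min_profitable_iters = (VOC - SOC) * VF / saving_per_viter = 80 / 10 = 8

and because SIC * VF * 8 == VIC * 8 + (VOC - SOC) * VF (128 == 128), the result is bumped to 9: from 9 scalar iterations on, the scalar cost (4 * 9 = 36) exceeds the modelled vector cost (VIC * (9 / VF) + VOC = 12 + 20 = 32).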
*/ 3658 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3659 { 3660 /* Rewriting the condition above in terms of the number of 3661 vector iterations (vniters) rather than the number of 3662 scalar iterations (niters) gives: 3663 3664 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC 3665 3666 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC 3667 3668 For integer N, X and Y when X > 0: 3669 3670 N * X > Y <==> N >= (Y /[floor] X) + 1. */ 3671 int outside_overhead = (vec_outside_cost 3672 - scalar_single_iter_cost * peel_iters_prologue 3673 - scalar_single_iter_cost * peel_iters_epilogue 3674 - scalar_outside_cost); 3675 /* We're only interested in cases that require at least one 3676 vector iteration. */ 3677 int min_vec_niters = 1; 3678 if (outside_overhead > 0) 3679 min_vec_niters = outside_overhead / saving_per_viter + 1; 3680 3681 if (dump_enabled_p ()) 3682 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n", 3683 min_vec_niters); 3684 3685 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3686 { 3687 /* Now that we know the minimum number of vector iterations, 3688 find the minimum niters for which the scalar cost is larger: 3689 3690 SIC * niters > VIC * vniters + VOC - SOC 3691 3692 We know that the minimum niters is no more than 3693 vniters * VF + NPEEL, but it might be (and often is) less 3694 than that if a partial vector iteration is cheaper than the 3695 equivalent scalar code. */ 3696 int threshold = (vec_inside_cost * min_vec_niters 3697 + vec_outside_cost 3698 - scalar_outside_cost); 3699 if (threshold <= 0) 3700 min_profitable_iters = 1; 3701 else 3702 min_profitable_iters = threshold / scalar_single_iter_cost + 1; 3703 } 3704 else 3705 /* Convert the number of vector iterations into a number of 3706 scalar iterations. */ 3707 min_profitable_iters = (min_vec_niters * assumed_vf 3708 + peel_iters_prologue 3709 + peel_iters_epilogue); 3710 } 3711 else 3712 { 3713 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) 3714 * assumed_vf 3715 - vec_inside_cost * peel_iters_prologue 3716 - vec_inside_cost * peel_iters_epilogue); 3717 if (min_profitable_iters <= 0) 3718 min_profitable_iters = 0; 3719 else 3720 { 3721 min_profitable_iters /= saving_per_viter; 3722 3723 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) 3724 <= (((int) vec_inside_cost * min_profitable_iters) 3725 + (((int) vec_outside_cost - scalar_outside_cost) 3726 * assumed_vf))) 3727 min_profitable_iters++; 3728 } 3729 } 3730 3731 if (dump_enabled_p ()) 3732 dump_printf (MSG_NOTE, 3733 " Calculated minimum iters for profitability: %d\n", 3734 min_profitable_iters); 3735 3736 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 3737 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) 3738 /* We want the vectorized loop to execute at least once. */ 3739 min_profitable_iters = assumed_vf + peel_iters_prologue; 3740 3741 if (dump_enabled_p ()) 3742 dump_printf_loc (MSG_NOTE, vect_location, 3743 " Runtime profitability threshold = %d\n", 3744 min_profitable_iters); 3745 3746 *ret_min_profitable_niters = min_profitable_iters; 3747 3748 /* Calculate number of iterations required to make the vector version 3749 profitable, relative to the loop bodies only. 3750 3751 Non-vectorized variant is SIC * niters and it must win over vector 3752 variant on the expected loop trip count. 
The following condition must hold true: 3753 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */ 3754 3755 if (vec_outside_cost <= 0) 3756 min_profitable_estimate = 0; 3757 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3758 { 3759 /* This is a repeat of the code above, but with + SOC rather 3760 than - SOC. */ 3761 int outside_overhead = (vec_outside_cost 3762 - scalar_single_iter_cost * peel_iters_prologue 3763 - scalar_single_iter_cost * peel_iters_epilogue 3764 + scalar_outside_cost); 3765 int min_vec_niters = 1; 3766 if (outside_overhead > 0) 3767 min_vec_niters = outside_overhead / saving_per_viter + 1; 3768 3769 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3770 { 3771 int threshold = (vec_inside_cost * min_vec_niters 3772 + vec_outside_cost 3773 + scalar_outside_cost); 3774 min_profitable_estimate = threshold / scalar_single_iter_cost + 1; 3775 } 3776 else 3777 min_profitable_estimate = (min_vec_niters * assumed_vf 3778 + peel_iters_prologue 3779 + peel_iters_epilogue); 3780 } 3781 else 3782 { 3783 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) 3784 * assumed_vf 3785 - vec_inside_cost * peel_iters_prologue 3786 - vec_inside_cost * peel_iters_epilogue) 3787 / ((scalar_single_iter_cost * assumed_vf) 3788 - vec_inside_cost); 3789 } 3790 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); 3791 if (dump_enabled_p ()) 3792 dump_printf_loc (MSG_NOTE, vect_location, 3793 " Static estimate profitability threshold = %d\n", 3794 min_profitable_estimate); 3795 3796 *ret_min_profitable_estimate = min_profitable_estimate; 3797 } 3798 3799 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET 3800 vector elements (not bits) for a vector with NELT elements. */ 3801 static void 3802 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, 3803 vec_perm_builder *sel) 3804 { 3805 /* The encoding is a single stepped pattern. Any wrap-around is handled 3806 by vec_perm_indices. */ 3807 sel->new_vector (nelt, 1, 3); 3808 for (unsigned int i = 0; i < 3; i++) 3809 sel->quick_push (i + offset); 3810 } 3811 3812 /* Checks whether the target supports whole-vector shifts for vectors of mode 3813 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ 3814 it supports vec_perm_const with masks for all necessary shift amounts. */ 3815 static bool 3816 have_whole_vector_shift (machine_mode mode) 3817 { 3818 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) 3819 return true; 3820 3821 /* Variable-length vectors should be handled via the optab. */ 3822 unsigned int nelt; 3823 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 3824 return false; 3825 3826 vec_perm_builder sel; 3827 vec_perm_indices indices; 3828 for (unsigned int i = nelt / 2; i >= 1; i /= 2) 3829 { 3830 calc_vec_perm_mask_for_shift (i, nelt, &sel); 3831 indices.new_vector (sel, 2, nelt); 3832 if (!can_vec_perm_const_p (mode, indices, false)) 3833 return false; 3834 } 3835 return true; 3836 } 3837 3838 /* TODO: Close dependency between vect_model_*_cost and vectorizable_* 3839 functions. Design better to avoid maintenance issues. */ 3840 3841 /* Function vect_model_reduction_cost. 3842 3843 Models cost for a reduction operation, including the vector ops 3844 generated within the strip-mine loop, the initial definition before 3845 the loop, and the epilogue code that must be generated. 
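For example, a plain integer add reduction with NCOPIES == 1 and a direct internal reduction function is costed as one scalar_to_vec in the prologue (building the initial vector), one vector_stmt per copy in the body, and one vector_stmt plus one vec_to_scalar in the epilogue.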
*/ 3846 3847 static void 3848 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, 3849 int ncopies, stmt_vector_for_cost *cost_vec) 3850 { 3851 int prologue_cost = 0, epilogue_cost = 0, inside_cost; 3852 enum tree_code code; 3853 optab optab; 3854 tree vectype; 3855 machine_mode mode; 3856 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 3857 struct loop *loop = NULL; 3858 3859 if (loop_vinfo) 3860 loop = LOOP_VINFO_LOOP (loop_vinfo); 3861 3862 /* Condition reductions generate two reductions in the loop. */ 3863 vect_reduction_type reduction_type 3864 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); 3865 if (reduction_type == COND_REDUCTION) 3866 ncopies *= 2; 3867 3868 vectype = STMT_VINFO_VECTYPE (stmt_info); 3869 mode = TYPE_MODE (vectype); 3870 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); 3871 3872 code = gimple_assign_rhs_code (orig_stmt_info->stmt); 3873 3874 if (reduction_type == EXTRACT_LAST_REDUCTION 3875 || reduction_type == FOLD_LEFT_REDUCTION) 3876 { 3877 /* No extra instructions needed in the prologue. */ 3878 prologue_cost = 0; 3879 3880 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST) 3881 /* Count one reduction-like operation per vector. */ 3882 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar, 3883 stmt_info, 0, vect_body); 3884 else 3885 { 3886 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */ 3887 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype); 3888 inside_cost = record_stmt_cost (cost_vec, nelements, 3889 vec_to_scalar, stmt_info, 0, 3890 vect_body); 3891 inside_cost += record_stmt_cost (cost_vec, nelements, 3892 scalar_stmt, stmt_info, 0, 3893 vect_body); 3894 } 3895 } 3896 else 3897 { 3898 /* Add in cost for initial definition. 3899 For cond reduction we have four vectors: initial index, step, 3900 initial result of the data reduction, initial value of the index 3901 reduction. */ 3902 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1; 3903 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, 3904 scalar_to_vec, stmt_info, 0, 3905 vect_prologue); 3906 3907 /* Cost of reduction op inside loop. */ 3908 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, 3909 stmt_info, 0, vect_body); 3910 } 3911 3912 /* Determine cost of epilogue code. 3913 3914 We have a reduction operator that will reduce the vector in one statement. 3915 Also requires scalar extract. */ 3916 3917 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info)) 3918 { 3919 if (reduc_fn != IFN_LAST) 3920 { 3921 if (reduction_type == COND_REDUCTION) 3922 { 3923 /* An EQ stmt and an COND_EXPR stmt. */ 3924 epilogue_cost += record_stmt_cost (cost_vec, 2, 3925 vector_stmt, stmt_info, 0, 3926 vect_epilogue); 3927 /* Reduction of the max index and a reduction of the found 3928 values. */ 3929 epilogue_cost += record_stmt_cost (cost_vec, 2, 3930 vec_to_scalar, stmt_info, 0, 3931 vect_epilogue); 3932 /* A broadcast of the max value. */ 3933 epilogue_cost += record_stmt_cost (cost_vec, 1, 3934 scalar_to_vec, stmt_info, 0, 3935 vect_epilogue); 3936 } 3937 else 3938 { 3939 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt, 3940 stmt_info, 0, vect_epilogue); 3941 epilogue_cost += record_stmt_cost (cost_vec, 1, 3942 vec_to_scalar, stmt_info, 0, 3943 vect_epilogue); 3944 } 3945 } 3946 else if (reduction_type == COND_REDUCTION) 3947 { 3948 unsigned estimated_nunits = vect_nunits_for_cost (vectype); 3949 /* Extraction of scalar elements. 
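One extract from the index vector and one from the data vector per element, i.e. 2 * ESTIMATED_NUNITS vec_to_scalar operations.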
*/ 3950 epilogue_cost += record_stmt_cost (cost_vec, 3951 2 * estimated_nunits, 3952 vec_to_scalar, stmt_info, 0, 3953 vect_epilogue); 3954 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */ 3955 epilogue_cost += record_stmt_cost (cost_vec, 3956 2 * estimated_nunits - 3, 3957 scalar_stmt, stmt_info, 0, 3958 vect_epilogue); 3959 } 3960 else if (reduction_type == EXTRACT_LAST_REDUCTION 3961 || reduction_type == FOLD_LEFT_REDUCTION) 3962 /* No extra instructions need in the epilogue. */ 3963 ; 3964 else 3965 { 3966 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 3967 tree bitsize = 3968 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt))); 3969 int element_bitsize = tree_to_uhwi (bitsize); 3970 int nelements = vec_size_in_bits / element_bitsize; 3971 3972 if (code == COND_EXPR) 3973 code = MAX_EXPR; 3974 3975 optab = optab_for_tree_code (code, vectype, optab_default); 3976 3977 /* We have a whole vector shift available. */ 3978 if (optab != unknown_optab 3979 && VECTOR_MODE_P (mode) 3980 && optab_handler (optab, mode) != CODE_FOR_nothing 3981 && have_whole_vector_shift (mode)) 3982 { 3983 /* Final reduction via vector shifts and the reduction operator. 3984 Also requires scalar extract. */ 3985 epilogue_cost += record_stmt_cost (cost_vec, 3986 exact_log2 (nelements) * 2, 3987 vector_stmt, stmt_info, 0, 3988 vect_epilogue); 3989 epilogue_cost += record_stmt_cost (cost_vec, 1, 3990 vec_to_scalar, stmt_info, 0, 3991 vect_epilogue); 3992 } 3993 else 3994 /* Use extracts and reduction op for final reduction. For N 3995 elements, we have N extracts and N-1 reduction ops. */ 3996 epilogue_cost += record_stmt_cost (cost_vec, 3997 nelements + nelements - 1, 3998 vector_stmt, stmt_info, 0, 3999 vect_epilogue); 4000 } 4001 } 4002 4003 if (dump_enabled_p ()) 4004 dump_printf (MSG_NOTE, 4005 "vect_model_reduction_cost: inside_cost = %d, " 4006 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost, 4007 prologue_cost, epilogue_cost); 4008 } 4009 4010 4011 /* Function vect_model_induction_cost. 4012 4013 Models cost for induction operations. */ 4014 4015 static void 4016 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies, 4017 stmt_vector_for_cost *cost_vec) 4018 { 4019 unsigned inside_cost, prologue_cost; 4020 4021 if (PURE_SLP_STMT (stmt_info)) 4022 return; 4023 4024 /* loop cost for vec_loop. */ 4025 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, 4026 stmt_info, 0, vect_body); 4027 4028 /* prologue cost for vec_init and vec_step. */ 4029 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, 4030 stmt_info, 0, vect_prologue); 4031 4032 if (dump_enabled_p ()) 4033 dump_printf_loc (MSG_NOTE, vect_location, 4034 "vect_model_induction_cost: inside_cost = %d, " 4035 "prologue_cost = %d .\n", inside_cost, prologue_cost); 4036 } 4037 4038 4039 4040 /* Function get_initial_def_for_reduction 4041 4042 Input: 4043 STMT_VINFO - a stmt that performs a reduction operation in the loop. 4044 INIT_VAL - the initial value of the reduction variable 4045 4046 Output: 4047 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result 4048 of the reduction (used for adjusting the epilog - see below). 4049 Return a vector variable, initialized according to the operation that 4050 STMT_VINFO performs. This vector will be used as the initial value 4051 of the vector of partial results. 
4052 4053 Option1 (adjust in epilog): Initialize the vector as follows: 4054 add/bit or/xor: [0,0,...,0,0] 4055 mult/bit and: [1,1,...,1,1] 4056 min/max/cond_expr: [init_val,init_val,..,init_val,init_val] 4057 and when necessary (e.g. add/mult case) let the caller know 4058 that it needs to adjust the result by init_val. 4059 4060 Option2: Initialize the vector as follows: 4061 add/bit or/xor: [init_val,0,0,...,0] 4062 mult/bit and: [init_val,1,1,...,1] 4063 min/max/cond_expr: [init_val,init_val,...,init_val] 4064 and no adjustments are needed. 4065 4066 For example, for the following code: 4067 4068 s = init_val; 4069 for (i=0;i<n;i++) 4070 s = s + a[i]; 4071 4072 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'. 4073 For a vector of 4 units, we want to return either [0,0,0,init_val], 4074 or [0,0,0,0] and let the caller know that it needs to adjust 4075 the result at the end by 'init_val'. 4076 4077 FORNOW, we are using the 'adjust in epilog' scheme, because this way the 4078 initialization vector is simpler (same element in all entries), if 4079 ADJUSTMENT_DEF is not NULL, and Option2 otherwise. 4080 4081 A cost model should help decide between these two schemes. */ 4082 4083 tree 4084 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, 4085 tree *adjustment_def) 4086 { 4087 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 4088 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 4089 tree scalar_type = TREE_TYPE (init_val); 4090 tree vectype = get_vectype_for_scalar_type (scalar_type); 4091 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt); 4092 tree def_for_init; 4093 tree init_def; 4094 REAL_VALUE_TYPE real_init_val = dconst0; 4095 int int_init_val = 0; 4096 gimple_seq stmts = NULL; 4097 4098 gcc_assert (vectype); 4099 4100 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) 4101 || SCALAR_FLOAT_TYPE_P (scalar_type)); 4102 4103 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) 4104 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); 4105 4106 vect_reduction_type reduction_type 4107 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); 4108 4109 switch (code) 4110 { 4111 case WIDEN_SUM_EXPR: 4112 case DOT_PROD_EXPR: 4113 case SAD_EXPR: 4114 case PLUS_EXPR: 4115 case MINUS_EXPR: 4116 case BIT_IOR_EXPR: 4117 case BIT_XOR_EXPR: 4118 case MULT_EXPR: 4119 case BIT_AND_EXPR: 4120 { 4121 /* ADJUSTMENT_DEF is NULL when called from 4122 vect_create_epilog_for_reduction to vectorize double reduction. */ 4123 if (adjustment_def) 4124 *adjustment_def = init_val; 4125 4126 if (code == MULT_EXPR) 4127 { 4128 real_init_val = dconst1; 4129 int_init_val = 1; 4130 } 4131 4132 if (code == BIT_AND_EXPR) 4133 int_init_val = -1; 4134 4135 if (SCALAR_FLOAT_TYPE_P (scalar_type)) 4136 def_for_init = build_real (scalar_type, real_init_val); 4137 else 4138 def_for_init = build_int_cst (scalar_type, int_init_val); 4139 4140 if (adjustment_def) 4141 /* Option1: the first element is '0' or '1' as well. */ 4142 init_def = gimple_build_vector_from_val (&stmts, vectype, 4143 def_for_init); 4144 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) 4145 { 4146 /* Option2 (variable length): the first element is INIT_VAL. */ 4147 init_def = gimple_build_vector_from_val (&stmts, vectype, 4148 def_for_init); 4149 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, 4150 vectype, init_def, init_val); 4151 } 4152 else 4153 { 4154 /* Option2: the first element is INIT_VAL. 
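The two-element encoding built below expands to INIT_VAL followed by copies of DEF_FOR_INIT, e.g. { init_val, 0, 0, ..., 0 } for a PLUS_EXPR reduction.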
*/ 4155 tree_vector_builder elts (vectype, 1, 2); 4156 elts.quick_push (init_val); 4157 elts.quick_push (def_for_init); 4158 init_def = gimple_build_vector (&stmts, &elts); 4159 } 4160 } 4161 break; 4162 4163 case MIN_EXPR: 4164 case MAX_EXPR: 4165 case COND_EXPR: 4166 { 4167 if (adjustment_def) 4168 { 4169 *adjustment_def = NULL_TREE; 4170 if (reduction_type != COND_REDUCTION 4171 && reduction_type != EXTRACT_LAST_REDUCTION) 4172 { 4173 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo); 4174 break; 4175 } 4176 } 4177 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); 4178 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); 4179 } 4180 break; 4181 4182 default: 4183 gcc_unreachable (); 4184 } 4185 4186 if (stmts) 4187 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 4188 return init_def; 4189 } 4190 4191 /* Get at the initial defs for the reduction PHIs in SLP_NODE. 4192 NUMBER_OF_VECTORS is the number of vector defs to create. 4193 If NEUTRAL_OP is nonnull, introducing extra elements of that 4194 value will not change the result. */ 4195 4196 static void 4197 get_initial_defs_for_reduction (slp_tree slp_node, 4198 vec<tree> *vec_oprnds, 4199 unsigned int number_of_vectors, 4200 bool reduc_chain, tree neutral_op) 4201 { 4202 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 4203 stmt_vec_info stmt_vinfo = stmts[0]; 4204 unsigned HOST_WIDE_INT nunits; 4205 unsigned j, number_of_places_left_in_vector; 4206 tree vector_type; 4207 unsigned int group_size = stmts.length (); 4208 unsigned int i; 4209 struct loop *loop; 4210 4211 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); 4212 4213 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); 4214 4215 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father; 4216 gcc_assert (loop); 4217 edge pe = loop_preheader_edge (loop); 4218 4219 gcc_assert (!reduc_chain || neutral_op); 4220 4221 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 4222 created vectors. It is greater than 1 if unrolling is performed. 4223 4224 For example, we have two scalar operands, s1 and s2 (e.g., group of 4225 strided accesses of size two), while NUNITS is four (i.e., four scalars 4226 of this type can be packed in a vector). The output vector will contain 4227 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 4228 will be 2). 4229 4230 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several 4231 vectors containing the operands. 4232 4233 For example, NUNITS is four as before, and the group size is 8 4234 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 4235 {s5, s6, s7, s8}. */ 4236 4237 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) 4238 nunits = group_size; 4239 4240 number_of_places_left_in_vector = nunits; 4241 bool constant_p = true; 4242 tree_vector_builder elts (vector_type, nunits, 1); 4243 elts.quick_grow (nunits); 4244 gimple_seq ctor_seq = NULL; 4245 for (j = 0; j < nunits * number_of_vectors; ++j) 4246 { 4247 tree op; 4248 i = j % group_size; 4249 stmt_vinfo = stmts[i]; 4250 4251 /* Get the def before the loop. In reduction chain we have only 4252 one initial value. Else we have as many as PHIs in the group. */ 4253 if (reduc_chain) 4254 op = j != 0 ? 
neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); 4255 else if (((vec_oprnds->length () + 1) * nunits 4256 - number_of_places_left_in_vector >= group_size) 4257 && neutral_op) 4258 op = neutral_op; 4259 else 4260 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); 4261 4262 /* Create 'vect_ = {op0,op1,...,opn}'. */ 4263 number_of_places_left_in_vector--; 4264 elts[nunits - number_of_places_left_in_vector - 1] = op; 4265 if (!CONSTANT_CLASS_P (op)) 4266 constant_p = false; 4267 4268 if (number_of_places_left_in_vector == 0) 4269 { 4270 tree init; 4271 if (constant_p && !neutral_op 4272 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) 4273 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) 4274 /* Build the vector directly from ELTS. */ 4275 init = gimple_build_vector (&ctor_seq, &elts); 4276 else if (neutral_op) 4277 { 4278 /* Build a vector of the neutral value and shift the 4279 other elements into place. */ 4280 init = gimple_build_vector_from_val (&ctor_seq, vector_type, 4281 neutral_op); 4282 int k = nunits; 4283 while (k > 0 && elts[k - 1] == neutral_op) 4284 k -= 1; 4285 while (k > 0) 4286 { 4287 k -= 1; 4288 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT, 4289 vector_type, init, elts[k]); 4290 } 4291 } 4292 else 4293 { 4294 /* First time round, duplicate ELTS to fill the 4295 required number of vectors. */ 4296 duplicate_and_interleave (&ctor_seq, vector_type, elts, 4297 number_of_vectors, *vec_oprnds); 4298 break; 4299 } 4300 vec_oprnds->quick_push (init); 4301 4302 number_of_places_left_in_vector = nunits; 4303 elts.new_vector (vector_type, nunits, 1); 4304 elts.quick_grow (nunits); 4305 constant_p = true; 4306 } 4307 } 4308 if (ctor_seq != NULL) 4309 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); 4310 } 4311 4312 4313 /* Function vect_create_epilog_for_reduction 4314 4315 Create code at the loop-epilog to finalize the result of a reduction 4316 computation. 4317 4318 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector 4319 reduction statements. 4320 STMT_INFO is the scalar reduction stmt that is being vectorized. 4321 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the 4322 number of elements that we can fit in a vectype (nunits). In this case 4323 we have to generate more than one vector stmt - i.e - we need to "unroll" 4324 the vector stmt by a factor VF/nunits. For more details see documentation 4325 in vectorizable_operation. 4326 REDUC_FN is the internal function for the epilog reduction. 4327 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction 4328 computation. 4329 REDUC_INDEX is the index of the operand in the right hand side of the 4330 statement that is defined by REDUCTION_PHI. 4331 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. 4332 SLP_NODE is an SLP node containing a group of reduction statements. The 4333 first one in this group is STMT_INFO. 4334 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case 4335 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to 4336 be smaller than any value of the IV in the loop, for MIN_EXPR larger than 4337 any value of the IV in the loop. 4338 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION. 4339 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is 4340 null if this is not an SLP reduction 4341 4342 This function: 4343 1. 
Creates the reduction def-use cycles: sets the arguments for 4344 REDUCTION_PHIS: 4345 The loop-entry argument is the vectorized initial-value of the reduction. 4346 The loop-latch argument is taken from VECT_DEFS - the vector of partial 4347 sums. 4348 2. "Reduces" each vector of partial results VECT_DEFS into a single result, 4349 by calling the function specified by REDUC_FN if available, or by 4350 other means (whole-vector shifts or a scalar loop). 4351 The function also creates a new phi node at the loop exit to preserve 4352 loop-closed form, as illustrated below. 4353 4354 The flow at the entry to this function: 4355 4356 loop: 4357 vec_def = phi <null, null> # REDUCTION_PHI 4358 VECT_DEF = vector_stmt # vectorized form of STMT_INFO 4359 s_loop = scalar_stmt # (scalar) STMT_INFO 4360 loop_exit: 4361 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4362 use <s_out0> 4363 use <s_out0> 4364 4365 The above is transformed by this function into: 4366 4367 loop: 4368 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4369 VECT_DEF = vector_stmt # vectorized form of STMT_INFO 4370 s_loop = scalar_stmt # (scalar) STMT_INFO 4371 loop_exit: 4372 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4373 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4374 v_out2 = reduce <v_out1> 4375 s_out3 = extract_field <v_out2, 0> 4376 s_out4 = adjust_result <s_out3> 4377 use <s_out4> 4378 use <s_out4> 4379 */ 4380 4381 static void 4382 vect_create_epilog_for_reduction (vec<tree> vect_defs, 4383 stmt_vec_info stmt_info, 4384 gimple *reduc_def_stmt, 4385 int ncopies, internal_fn reduc_fn, 4386 vec<stmt_vec_info> reduction_phis, 4387 bool double_reduc, 4388 slp_tree slp_node, 4389 slp_instance slp_node_instance, 4390 tree induc_val, enum tree_code induc_code, 4391 tree neutral_op) 4392 { 4393 stmt_vec_info prev_phi_info; 4394 tree vectype; 4395 machine_mode mode; 4396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; 4398 basic_block exit_bb; 4399 tree scalar_dest; 4400 tree scalar_type; 4401 gimple *new_phi = NULL, *phi; 4402 stmt_vec_info phi_info; 4403 gimple_stmt_iterator exit_gsi; 4404 tree vec_dest; 4405 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; 4406 gimple *epilog_stmt = NULL; 4407 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt); 4408 gimple *exit_phi; 4409 tree bitsize; 4410 tree adjustment_def = NULL; 4411 tree vec_initial_def = NULL; 4412 tree expr, def, initial_def = NULL; 4413 tree orig_name, scalar_result; 4414 imm_use_iterator imm_iter, phi_imm_iter; 4415 use_operand_p use_p, phi_use_p; 4416 gimple *use_stmt; 4417 stmt_vec_info reduction_phi_info = NULL; 4418 bool nested_in_vect_loop = false; 4419 auto_vec<gimple *> new_phis; 4420 auto_vec<stmt_vec_info> inner_phis; 4421 int j, i; 4422 auto_vec<tree> scalar_results; 4423 unsigned int group_size = 1, k, ratio; 4424 auto_vec<tree> vec_initial_defs; 4425 auto_vec<gimple *> phis; 4426 bool slp_reduc = false; 4427 bool direct_slp_reduc; 4428 tree new_phi_result; 4429 stmt_vec_info inner_phi = NULL; 4430 tree induction_index = NULL_TREE; 4431 4432 if (slp_node) 4433 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 4434 4435 if (nested_in_vect_loop_p (loop, stmt_info)) 4436 { 4437 outer_loop = loop; 4438 loop = loop->inner; 4439 nested_in_vect_loop = true; 4440 gcc_assert (!slp_node); 4441 } 4442 4443 vectype = STMT_VINFO_VECTYPE (stmt_info); 4444 gcc_assert (vectype); 4445 mode = TYPE_MODE (vectype); 4446 4447 /* 1. 
Create the reduction def-use cycle: 4448 Set the arguments of REDUCTION_PHIS, i.e., transform 4449 4450 loop: 4451 vec_def = phi <null, null> # REDUCTION_PHI 4452 VECT_DEF = vector_stmt # vectorized form of STMT 4453 ... 4454 4455 into: 4456 4457 loop: 4458 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4459 VECT_DEF = vector_stmt # vectorized form of STMT 4460 ... 4461 4462 (in case of SLP, do it for all the phis). */ 4463 4464 /* Get the loop-entry arguments. */ 4465 enum vect_def_type initial_def_dt = vect_unknown_def_type; 4466 if (slp_node) 4467 { 4468 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 4469 vec_initial_defs.reserve (vec_num); 4470 get_initial_defs_for_reduction (slp_node_instance->reduc_phis, 4471 &vec_initial_defs, vec_num, 4472 REDUC_GROUP_FIRST_ELEMENT (stmt_info), 4473 neutral_op); 4474 } 4475 else 4476 { 4477 /* Get at the scalar def before the loop, that defines the initial value 4478 of the reduction variable. */ 4479 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 4480 loop_preheader_edge (loop)); 4481 /* Optimize: if initial_def is for REDUC_MAX smaller than the base 4482 and we can't use zero for induc_val, use initial_def. Similarly 4483 for REDUC_MIN and initial_def larger than the base. */ 4484 if (TREE_CODE (initial_def) == INTEGER_CST 4485 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4486 == INTEGER_INDUC_COND_REDUCTION) 4487 && !integer_zerop (induc_val) 4488 && ((induc_code == MAX_EXPR 4489 && tree_int_cst_lt (initial_def, induc_val)) 4490 || (induc_code == MIN_EXPR 4491 && tree_int_cst_lt (induc_val, initial_def)))) 4492 induc_val = initial_def; 4493 4494 if (double_reduc) 4495 /* In case of double reduction we only create a vector variable 4496 to be put in the reduction phi node. The actual statement 4497 creation is done later in this function. */ 4498 vec_initial_def = vect_create_destination_var (initial_def, vectype); 4499 else if (nested_in_vect_loop) 4500 { 4501 /* Do not use an adjustment def as that case is not supported 4502 correctly if ncopies is not one. */ 4503 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt); 4504 vec_initial_def = vect_get_vec_def_for_operand (initial_def, 4505 stmt_info); 4506 } 4507 else 4508 vec_initial_def 4509 = get_initial_def_for_reduction (stmt_info, initial_def, 4510 &adjustment_def); 4511 vec_initial_defs.create (1); 4512 vec_initial_defs.quick_push (vec_initial_def); 4513 } 4514 4515 /* Set phi nodes arguments. */ 4516 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info) 4517 { 4518 tree vec_init_def = vec_initial_defs[i]; 4519 tree def = vect_defs[i]; 4520 for (j = 0; j < ncopies; j++) 4521 { 4522 if (j != 0) 4523 { 4524 phi_info = STMT_VINFO_RELATED_STMT (phi_info); 4525 if (nested_in_vect_loop) 4526 vec_init_def 4527 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def); 4528 } 4529 4530 /* Set the loop-entry arg of the reduction-phi. */ 4531 4532 gphi *phi = as_a <gphi *> (phi_info->stmt); 4533 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4534 == INTEGER_INDUC_COND_REDUCTION) 4535 { 4536 /* Initialise the reduction phi to zero. This prevents initial 4537 values of non-zero interferring with the reduction op. 
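Concretely, the phi is seeded with a splat of INDUC_VAL; if the final result still equals INDUC_VAL, the epilogue code further down restores the original initial value.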
*/ 4538 gcc_assert (ncopies == 1); 4539 gcc_assert (i == 0); 4540 4541 tree vec_init_def_type = TREE_TYPE (vec_init_def); 4542 tree induc_val_vec 4543 = build_vector_from_val (vec_init_def_type, induc_val); 4544 4545 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop), 4546 UNKNOWN_LOCATION); 4547 } 4548 else 4549 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop), 4550 UNKNOWN_LOCATION); 4551 4552 /* Set the loop-latch arg for the reduction-phi. */ 4553 if (j > 0) 4554 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def); 4555 4556 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); 4557 4558 if (dump_enabled_p ()) 4559 dump_printf_loc (MSG_NOTE, vect_location, 4560 "transform reduction: created def-use cycle: %G%G", 4561 phi, SSA_NAME_DEF_STMT (def)); 4562 } 4563 } 4564 4565 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) 4566 which is updated with the current index of the loop for every match of 4567 the original loop's cond_expr (VEC_STMT). This results in a vector 4568 containing the last time the condition passed for that vector lane. 4569 The first match will be a 1 to allow 0 to be used for non-matching 4570 indexes. If there are no matches at all then the vector will be all 4571 zeroes. */ 4572 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 4573 { 4574 tree indx_before_incr, indx_after_incr; 4575 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); 4576 4577 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt; 4578 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); 4579 4580 int scalar_precision 4581 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); 4582 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); 4583 tree cr_index_vector_type = build_vector_type 4584 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype)); 4585 4586 /* First we create a simple vector induction variable which starts 4587 with the values {1,2,3,...} (SERIES_VECT) and increments by the 4588 vector size (STEP). */ 4589 4590 /* Create a {1,2,3,...} vector. */ 4591 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); 4592 4593 /* Create a vector of the step value. */ 4594 tree step = build_int_cst (cr_index_scalar_type, nunits_out); 4595 tree vec_step = build_vector_from_val (cr_index_vector_type, step); 4596 4597 /* Create an induction variable. */ 4598 gimple_stmt_iterator incr_gsi; 4599 bool insert_after; 4600 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4601 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, 4602 insert_after, &indx_before_incr, &indx_after_incr); 4603 4604 /* Next create a new phi node vector (NEW_PHI_TREE) which starts 4605 filled with zeros (VEC_ZERO). */ 4606 4607 /* Create a vector of 0s. */ 4608 tree zero = build_zero_cst (cr_index_scalar_type); 4609 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); 4610 4611 /* Create a vector phi node. */ 4612 tree new_phi_tree = make_ssa_name (cr_index_vector_type); 4613 new_phi = create_phi_node (new_phi_tree, loop->header); 4614 loop_vinfo->add_stmt (new_phi); 4615 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, 4616 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4617 4618 /* Now take the condition from the loops original cond_expr 4619 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for 4620 every match uses values from the induction variable 4621 (INDEX_BEFORE_INCR) otherwise uses values from the phi node 4622 (NEW_PHI_TREE). 
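In gimple form this is INDEX_COND_EXPR = VEC_COND_EXPR <CCOMPARE, INDEX_BEFORE_INCR, NEW_PHI_TREE>.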
4623 Finally, we update the phi (NEW_PHI_TREE) to take the value of 4624 the new cond_expr (INDEX_COND_EXPR). */ 4625 4626 /* Duplicate the condition from vec_stmt. */ 4627 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); 4628 4629 /* Create a conditional, where the condition is taken from vec_stmt 4630 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and 4631 else is the phi (NEW_PHI_TREE). */ 4632 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, 4633 ccompare, indx_before_incr, 4634 new_phi_tree); 4635 induction_index = make_ssa_name (cr_index_vector_type); 4636 gimple *index_condition = gimple_build_assign (induction_index, 4637 index_cond_expr); 4638 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); 4639 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition); 4640 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; 4641 4642 /* Update the phi with the vec cond. */ 4643 add_phi_arg (as_a <gphi *> (new_phi), induction_index, 4644 loop_latch_edge (loop), UNKNOWN_LOCATION); 4645 } 4646 4647 /* 2. Create epilog code. 4648 The reduction epilog code operates across the elements of the vector 4649 of partial results computed by the vectorized loop. 4650 The reduction epilog code consists of: 4651 4652 step 1: compute the scalar result in a vector (v_out2) 4653 step 2: extract the scalar result (s_out3) from the vector (v_out2) 4654 step 3: adjust the scalar result (s_out3) if needed. 4655 4656 Step 1 can be accomplished using one the following three schemes: 4657 (scheme 1) using reduc_fn, if available. 4658 (scheme 2) using whole-vector shifts, if available. 4659 (scheme 3) using a scalar loop. In this case steps 1+2 above are 4660 combined. 4661 4662 The overall epilog code looks like this: 4663 4664 s_out0 = phi <s_loop> # original EXIT_PHI 4665 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4666 v_out2 = reduce <v_out1> # step 1 4667 s_out3 = extract_field <v_out2, 0> # step 2 4668 s_out4 = adjust_result <s_out3> # step 3 4669 4670 (step 3 is optional, and steps 1 and 2 may be combined). 4671 Lastly, the uses of s_out0 are replaced by s_out4. */ 4672 4673 4674 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: 4675 v_out1 = phi <VECT_DEF> 4676 Store them in NEW_PHIS. */ 4677 4678 exit_bb = single_exit (loop)->dest; 4679 prev_phi_info = NULL; 4680 new_phis.create (vect_defs.length ()); 4681 FOR_EACH_VEC_ELT (vect_defs, i, def) 4682 { 4683 for (j = 0; j < ncopies; j++) 4684 { 4685 tree new_def = copy_ssa_name (def); 4686 phi = create_phi_node (new_def, exit_bb); 4687 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi); 4688 if (j == 0) 4689 new_phis.quick_push (phi); 4690 else 4691 { 4692 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def); 4693 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info; 4694 } 4695 4696 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 4697 prev_phi_info = phi_info; 4698 } 4699 } 4700 4701 /* The epilogue is created for the outer-loop, i.e., for the loop being 4702 vectorized. Create exit phis for the outer loop. 
*/ 4703 if (double_reduc) 4704 { 4705 loop = outer_loop; 4706 exit_bb = single_exit (loop)->dest; 4707 inner_phis.create (vect_defs.length ()); 4708 FOR_EACH_VEC_ELT (new_phis, i, phi) 4709 { 4710 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); 4711 tree new_result = copy_ssa_name (PHI_RESULT (phi)); 4712 gphi *outer_phi = create_phi_node (new_result, exit_bb); 4713 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 4714 PHI_RESULT (phi)); 4715 prev_phi_info = loop_vinfo->add_stmt (outer_phi); 4716 inner_phis.quick_push (phi_info); 4717 new_phis[i] = outer_phi; 4718 while (STMT_VINFO_RELATED_STMT (phi_info)) 4719 { 4720 phi_info = STMT_VINFO_RELATED_STMT (phi_info); 4721 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt)); 4722 outer_phi = create_phi_node (new_result, exit_bb); 4723 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 4724 PHI_RESULT (phi_info->stmt)); 4725 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi); 4726 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info; 4727 prev_phi_info = outer_phi_info; 4728 } 4729 } 4730 } 4731 4732 exit_gsi = gsi_after_labels (exit_bb); 4733 4734 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 4735 (i.e. when reduc_fn is not available) and in the final adjustment 4736 code (if needed). Also get the original scalar reduction variable as 4737 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it 4738 represents a reduction pattern), the tree-code and scalar-def are 4739 taken from the original stmt that the pattern-stmt (STMT) replaces. 4740 Otherwise (it is a regular reduction) - the tree-code and scalar-def 4741 are taken from STMT. */ 4742 4743 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); 4744 if (orig_stmt_info != stmt_info) 4745 { 4746 /* Reduction pattern */ 4747 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 4748 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); 4749 } 4750 4751 code = gimple_assign_rhs_code (orig_stmt_info->stmt); 4752 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, 4753 partial results are added and not subtracted. */ 4754 if (code == MINUS_EXPR) 4755 code = PLUS_EXPR; 4756 4757 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); 4758 scalar_type = TREE_TYPE (scalar_dest); 4759 scalar_results.create (group_size); 4760 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 4761 bitsize = TYPE_SIZE (scalar_type); 4762 4763 /* In case this is a reduction in an inner-loop while vectorizing an outer 4764 loop - we don't need to extract a single scalar result at the end of the 4765 inner-loop (unless it is double reduction, i.e., the use of reduction is 4766 outside the outer-loop). The final vector of partial results will be used 4767 in the vectorized outer-loop, or reduced to a scalar result at the end of 4768 the outer-loop. */ 4769 if (nested_in_vect_loop && !double_reduc) 4770 goto vect_finalize_reduction; 4771 4772 /* SLP reduction without reduction chain, e.g., 4773 # a1 = phi <a2, a0> 4774 # b1 = phi <b2, b0> 4775 a2 = operation (a1) 4776 b2 = operation (b1) */ 4777 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); 4778 4779 /* True if we should implement SLP_REDUC using native reduction operations 4780 instead of scalar operations. 
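This requires a reduction internal function and is only used when the number of vector elements is not a compile-time constant, in which case the element-by-element scalar epilogue is not possible.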
*/ 4781 direct_slp_reduc = (reduc_fn != IFN_LAST 4782 && slp_reduc 4783 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); 4784 4785 /* In case of reduction chain, e.g., 4786 # a1 = phi <a3, a0> 4787 a2 = operation (a1) 4788 a3 = operation (a2), 4789 4790 we may end up with more than one vector result. Here we reduce them to 4791 one vector. */ 4792 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) 4793 { 4794 tree first_vect = PHI_RESULT (new_phis[0]); 4795 gassign *new_vec_stmt = NULL; 4796 vec_dest = vect_create_destination_var (scalar_dest, vectype); 4797 for (k = 1; k < new_phis.length (); k++) 4798 { 4799 gimple *next_phi = new_phis[k]; 4800 tree second_vect = PHI_RESULT (next_phi); 4801 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 4802 new_vec_stmt = gimple_build_assign (tem, code, 4803 first_vect, second_vect); 4804 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 4805 first_vect = tem; 4806 } 4807 4808 new_phi_result = first_vect; 4809 if (new_vec_stmt) 4810 { 4811 new_phis.truncate (0); 4812 new_phis.safe_push (new_vec_stmt); 4813 } 4814 } 4815 /* Likewise if we couldn't use a single defuse cycle. */ 4816 else if (ncopies > 1) 4817 { 4818 gcc_assert (new_phis.length () == 1); 4819 tree first_vect = PHI_RESULT (new_phis[0]); 4820 gassign *new_vec_stmt = NULL; 4821 vec_dest = vect_create_destination_var (scalar_dest, vectype); 4822 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]); 4823 for (int k = 1; k < ncopies; ++k) 4824 { 4825 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info); 4826 tree second_vect = PHI_RESULT (next_phi_info->stmt); 4827 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 4828 new_vec_stmt = gimple_build_assign (tem, code, 4829 first_vect, second_vect); 4830 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 4831 first_vect = tem; 4832 } 4833 new_phi_result = first_vect; 4834 new_phis.truncate (0); 4835 new_phis.safe_push (new_vec_stmt); 4836 } 4837 else 4838 new_phi_result = PHI_RESULT (new_phis[0]); 4839 4840 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION 4841 && reduc_fn != IFN_LAST) 4842 { 4843 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing 4844 various data values where the condition matched and another vector 4845 (INDUCTION_INDEX) containing all the indexes of those matches. We 4846 need to extract the last matching index (which will be the index with 4847 highest value) and use this to index into the data vector. 4848 For the case where there were no matches, the data vector will contain 4849 all default values and the index vector will be all zeros. */ 4850 4851 /* Get various versions of the type of the vector of indexes. */ 4852 tree index_vec_type = TREE_TYPE (induction_index); 4853 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); 4854 tree index_scalar_type = TREE_TYPE (index_vec_type); 4855 tree index_vec_cmp_type = build_same_sized_truth_vector_type 4856 (index_vec_type); 4857 4858 /* Get an unsigned integer version of the type of the data vector. */ 4859 int scalar_precision 4860 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 4861 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); 4862 tree vectype_unsigned = build_vector_type 4863 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype)); 4864 4865 /* First we need to create a vector (ZERO_VEC) of zeros and another 4866 vector (MAX_INDEX_VEC) filled with the last matching index, which we 4867 can create using a MAX reduction and then expanding. 
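For example, if INDUCTION_INDEX is {0, 3, 0, 2}, the MAX reduction yields 3 and MAX_INDEX_VEC becomes {3, 3, 3, 3}, so the comparison below selects the data value only in the lane whose recorded index is 3.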
4868 In the case where the loop never made any matches, the max index will 4869 be zero. */ 4870 4871 /* Vector of {0, 0, 0,...}. */ 4872 tree zero_vec = make_ssa_name (vectype); 4873 tree zero_vec_rhs = build_zero_cst (vectype); 4874 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs); 4875 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT); 4876 4877 /* Find maximum value from the vector of found indexes. */ 4878 tree max_index = make_ssa_name (index_scalar_type); 4879 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 4880 1, induction_index); 4881 gimple_call_set_lhs (max_index_stmt, max_index); 4882 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); 4883 4884 /* Vector of {max_index, max_index, max_index,...}. */ 4885 tree max_index_vec = make_ssa_name (index_vec_type); 4886 tree max_index_vec_rhs = build_vector_from_val (index_vec_type, 4887 max_index); 4888 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, 4889 max_index_vec_rhs); 4890 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); 4891 4892 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes 4893 with the vector (INDUCTION_INDEX) of found indexes, choosing values 4894 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) 4895 otherwise. Only one value should match, resulting in a vector 4896 (VEC_COND) with one data value and the rest zeros. 4897 In the case where the loop never made any matches, every index will 4898 match, resulting in a vector with all data values (which will all be 4899 the default value). */ 4900 4901 /* Compare the max index vector to the vector of found indexes to find 4902 the position of the max value. */ 4903 tree vec_compare = make_ssa_name (index_vec_cmp_type); 4904 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, 4905 induction_index, 4906 max_index_vec); 4907 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); 4908 4909 /* Use the compare to choose either values from the data vector or 4910 zero. */ 4911 tree vec_cond = make_ssa_name (vectype); 4912 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, 4913 vec_compare, new_phi_result, 4914 zero_vec); 4915 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); 4916 4917 /* Finally we need to extract the data value from the vector (VEC_COND) 4918 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR 4919 reduction, but because this doesn't exist, we can use a MAX reduction 4920 instead. The data value might be signed or a float so we need to cast 4921 it first. 4922 In the case where the loop never made any matches, the data values are 4923 all identical, and so will reduce down correctly. */ 4924 4925 /* Make the matched data values unsigned. */ 4926 tree vec_cond_cast = make_ssa_name (vectype_unsigned); 4927 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned, 4928 vec_cond); 4929 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast, 4930 VIEW_CONVERT_EXPR, 4931 vec_cond_cast_rhs); 4932 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT); 4933 4934 /* Reduce down to a scalar value. 
*/ 4935 tree data_reduc = make_ssa_name (scalar_type_unsigned); 4936 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 4937 1, vec_cond_cast); 4938 gimple_call_set_lhs (data_reduc_stmt, data_reduc); 4939 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); 4940 4941 /* Convert the reduced value back to the result type and set as the 4942 result. */ 4943 gimple_seq stmts = NULL; 4944 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, 4945 data_reduc); 4946 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 4947 scalar_results.safe_push (new_temp); 4948 } 4949 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION 4950 && reduc_fn == IFN_LAST) 4951 { 4952 /* Condition reduction without supported IFN_REDUC_MAX. Generate 4953 idx = 0; 4954 idx_val = induction_index[0]; 4955 val = data_reduc[0]; 4956 for (idx = 0, val = init, i = 0; i < nelts; ++i) 4957 if (induction_index[i] > idx_val) 4958 val = data_reduc[i], idx_val = induction_index[i]; 4959 return val; */ 4960 4961 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); 4962 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); 4963 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); 4964 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); 4965 /* Enforced by vectorizable_reduction, which ensures we have target 4966 support before allowing a conditional reduction on variable-length 4967 vectors. */ 4968 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); 4969 tree idx_val = NULL_TREE, val = NULL_TREE; 4970 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) 4971 { 4972 tree old_idx_val = idx_val; 4973 tree old_val = val; 4974 idx_val = make_ssa_name (idx_eltype); 4975 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF, 4976 build3 (BIT_FIELD_REF, idx_eltype, 4977 induction_index, 4978 bitsize_int (el_size), 4979 bitsize_int (off))); 4980 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4981 val = make_ssa_name (data_eltype); 4982 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, 4983 build3 (BIT_FIELD_REF, 4984 data_eltype, 4985 new_phi_result, 4986 bitsize_int (el_size), 4987 bitsize_int (off))); 4988 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 4989 if (off != 0) 4990 { 4991 tree new_idx_val = idx_val; 4992 tree new_val = val; 4993 if (off != v_size - el_size) 4994 { 4995 new_idx_val = make_ssa_name (idx_eltype); 4996 epilog_stmt = gimple_build_assign (new_idx_val, 4997 MAX_EXPR, idx_val, 4998 old_idx_val); 4999 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5000 } 5001 new_val = make_ssa_name (data_eltype); 5002 epilog_stmt = gimple_build_assign (new_val, 5003 COND_EXPR, 5004 build2 (GT_EXPR, 5005 boolean_type_node, 5006 idx_val, 5007 old_idx_val), 5008 val, old_val); 5009 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5010 idx_val = new_idx_val; 5011 val = new_val; 5012 } 5013 } 5014 /* Convert the reduced value back to the result type and set as the 5015 result. */ 5016 gimple_seq stmts = NULL; 5017 val = gimple_convert (&stmts, scalar_type, val); 5018 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5019 scalar_results.safe_push (val); 5020 } 5021 5022 /* 2.3 Create the reduction code, using one of the three schemes described 5023 above. In SLP we simply need to extract all the elements from the 5024 vector (without reducing them), so we use scalar shifts. 
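The branch below implements scheme 1 (a direct reduc_fn call); the final else branch chooses between schemes 2 and 3 depending on whether a whole-vector shift is available (REDUCE_WITH_SHIFT).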
*/ 5025 else if (reduc_fn != IFN_LAST && !slp_reduc) 5026 { 5027 tree tmp; 5028 tree vec_elem_type; 5029 5030 /* Case 1: Create: 5031 v_out2 = reduc_expr <v_out1> */ 5032 5033 if (dump_enabled_p ()) 5034 dump_printf_loc (MSG_NOTE, vect_location, 5035 "Reduce using direct vector reduction.\n"); 5036 5037 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); 5038 if (!useless_type_conversion_p (scalar_type, vec_elem_type)) 5039 { 5040 tree tmp_dest 5041 = vect_create_destination_var (scalar_dest, vec_elem_type); 5042 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, 5043 new_phi_result); 5044 gimple_set_lhs (epilog_stmt, tmp_dest); 5045 new_temp = make_ssa_name (tmp_dest, epilog_stmt); 5046 gimple_set_lhs (epilog_stmt, new_temp); 5047 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5048 5049 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR, 5050 new_temp); 5051 } 5052 else 5053 { 5054 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, 5055 new_phi_result); 5056 gimple_set_lhs (epilog_stmt, new_scalar_dest); 5057 } 5058 5059 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5060 gimple_set_lhs (epilog_stmt, new_temp); 5061 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5062 5063 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5064 == INTEGER_INDUC_COND_REDUCTION) 5065 && !operand_equal_p (initial_def, induc_val, 0)) 5066 { 5067 /* Earlier we set the initial value to be a vector if induc_val 5068 values. Check the result and if it is induc_val then replace 5069 with the original initial value, unless induc_val is 5070 the same as initial_def already. */ 5071 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 5072 induc_val); 5073 5074 tmp = make_ssa_name (new_scalar_dest); 5075 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 5076 initial_def, new_temp); 5077 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5078 new_temp = tmp; 5079 } 5080 5081 scalar_results.safe_push (new_temp); 5082 } 5083 else if (direct_slp_reduc) 5084 { 5085 /* Here we create one vector for each of the REDUC_GROUP_SIZE results, 5086 with the elements for other SLP statements replaced with the 5087 neutral value. We can then do a normal reduction on each vector. */ 5088 5089 /* Enforced by vectorizable_reduction. */ 5090 gcc_assert (new_phis.length () == 1); 5091 gcc_assert (pow2p_hwi (group_size)); 5092 5093 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; 5094 vec<stmt_vec_info> orig_phis 5095 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); 5096 gimple_seq seq = NULL; 5097 5098 /* Build a vector {0, 1, 2, ...}, with the same number of elements 5099 and the same element size as VECTYPE. */ 5100 tree index = build_index_vector (vectype, 0, 1); 5101 tree index_type = TREE_TYPE (index); 5102 tree index_elt_type = TREE_TYPE (index_type); 5103 tree mask_type = build_same_sized_truth_vector_type (index_type); 5104 5105 /* Create a vector that, for each element, identifies which of 5106 the REDUC_GROUP_SIZE results should use it. */ 5107 tree index_mask = build_int_cst (index_elt_type, group_size - 1); 5108 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, 5109 build_vector_from_val (index_type, index_mask)); 5110 5111 /* Get a neutral vector value. This is simply a splat of the neutral 5112 scalar value if we have one, otherwise the initial scalar value 5113 is itself a neutral value. 
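        For example, the neutral value is 0 for PLUS_EXPR, 1 for MULT_EXPR
        and all-ones for BIT_AND_EXPR; MIN_EXPR and MAX_EXPR have no such
        universal value, which is the case handled in the loop below.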
*/ 5114 tree vector_identity = NULL_TREE; 5115 if (neutral_op) 5116 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5117 neutral_op); 5118 for (unsigned int i = 0; i < group_size; ++i) 5119 { 5120 /* If there's no univeral neutral value, we can use the 5121 initial scalar value from the original PHI. This is used 5122 for MIN and MAX reduction, for example. */ 5123 if (!neutral_op) 5124 { 5125 tree scalar_value 5126 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, 5127 loop_preheader_edge (loop)); 5128 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5129 scalar_value); 5130 } 5131 5132 /* Calculate the equivalent of: 5133 5134 sel[j] = (index[j] == i); 5135 5136 which selects the elements of NEW_PHI_RESULT that should 5137 be included in the result. */ 5138 tree compare_val = build_int_cst (index_elt_type, i); 5139 compare_val = build_vector_from_val (index_type, compare_val); 5140 tree sel = gimple_build (&seq, EQ_EXPR, mask_type, 5141 index, compare_val); 5142 5143 /* Calculate the equivalent of: 5144 5145 vec = seq ? new_phi_result : vector_identity; 5146 5147 VEC is now suitable for a full vector reduction. */ 5148 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, 5149 sel, new_phi_result, vector_identity); 5150 5151 /* Do the reduction and convert it to the appropriate type. */ 5152 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), 5153 TREE_TYPE (vectype), vec); 5154 scalar = gimple_convert (&seq, scalar_type, scalar); 5155 scalar_results.safe_push (scalar); 5156 } 5157 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); 5158 } 5159 else 5160 { 5161 bool reduce_with_shift; 5162 tree vec_temp; 5163 5164 /* COND reductions all do the final reduction with MAX_EXPR 5165 or MIN_EXPR. */ 5166 if (code == COND_EXPR) 5167 { 5168 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5169 == INTEGER_INDUC_COND_REDUCTION) 5170 code = induc_code; 5171 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5172 == CONST_COND_REDUCTION) 5173 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 5174 else 5175 code = MAX_EXPR; 5176 } 5177 5178 /* See if the target wants to do the final (shift) reduction 5179 in a vector mode of smaller size and first reduce upper/lower 5180 halves against each other. */ 5181 enum machine_mode mode1 = mode; 5182 tree vectype1 = vectype; 5183 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); 5184 unsigned sz1 = sz; 5185 if (!slp_reduc 5186 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) 5187 sz1 = GET_MODE_SIZE (mode1).to_constant (); 5188 5189 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); 5190 reduce_with_shift = have_whole_vector_shift (mode1); 5191 if (!VECTOR_MODE_P (mode1)) 5192 reduce_with_shift = false; 5193 else 5194 { 5195 optab optab = optab_for_tree_code (code, vectype1, optab_default); 5196 if (optab_handler (optab, mode1) == CODE_FOR_nothing) 5197 reduce_with_shift = false; 5198 } 5199 5200 /* First reduce the vector to the desired vector size we should 5201 do shift reduction on by combining upper and lower halves. */ 5202 new_temp = new_phi_result; 5203 while (sz > sz1) 5204 { 5205 gcc_assert (!slp_reduc); 5206 sz /= 2; 5207 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); 5208 5209 /* The target has to make sure we support lowpart/highpart 5210 extraction, either via direct vector extract or through 5211 an integer mode punning. 
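        For example (illustrative only), halving a V4SImode accumulator to
        V2SImode extracts the low and high V2SI halves, either directly via
        vec_extract or by first viewing the V4SI value as a two-element
        vector of 64-bit integers.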
*/ 5212 tree dst1, dst2; 5213 if (convert_optab_handler (vec_extract_optab, 5214 TYPE_MODE (TREE_TYPE (new_temp)), 5215 TYPE_MODE (vectype1)) 5216 != CODE_FOR_nothing) 5217 { 5218 /* Extract sub-vectors directly once vec_extract becomes 5219 a conversion optab. */ 5220 dst1 = make_ssa_name (vectype1); 5221 epilog_stmt 5222 = gimple_build_assign (dst1, BIT_FIELD_REF, 5223 build3 (BIT_FIELD_REF, vectype1, 5224 new_temp, TYPE_SIZE (vectype1), 5225 bitsize_int (0))); 5226 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5227 dst2 = make_ssa_name (vectype1); 5228 epilog_stmt 5229 = gimple_build_assign (dst2, BIT_FIELD_REF, 5230 build3 (BIT_FIELD_REF, vectype1, 5231 new_temp, TYPE_SIZE (vectype1), 5232 bitsize_int (sz * BITS_PER_UNIT))); 5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5234 } 5235 else 5236 { 5237 /* Extract via punning to appropriately sized integer mode 5238 vector. */ 5239 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 5240 1); 5241 tree etype = build_vector_type (eltype, 2); 5242 gcc_assert (convert_optab_handler (vec_extract_optab, 5243 TYPE_MODE (etype), 5244 TYPE_MODE (eltype)) 5245 != CODE_FOR_nothing); 5246 tree tem = make_ssa_name (etype); 5247 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, 5248 build1 (VIEW_CONVERT_EXPR, 5249 etype, new_temp)); 5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5251 new_temp = tem; 5252 tem = make_ssa_name (eltype); 5253 epilog_stmt 5254 = gimple_build_assign (tem, BIT_FIELD_REF, 5255 build3 (BIT_FIELD_REF, eltype, 5256 new_temp, TYPE_SIZE (eltype), 5257 bitsize_int (0))); 5258 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5259 dst1 = make_ssa_name (vectype1); 5260 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, 5261 build1 (VIEW_CONVERT_EXPR, 5262 vectype1, tem)); 5263 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5264 tem = make_ssa_name (eltype); 5265 epilog_stmt 5266 = gimple_build_assign (tem, BIT_FIELD_REF, 5267 build3 (BIT_FIELD_REF, eltype, 5268 new_temp, TYPE_SIZE (eltype), 5269 bitsize_int (sz * BITS_PER_UNIT))); 5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5271 dst2 = make_ssa_name (vectype1); 5272 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, 5273 build1 (VIEW_CONVERT_EXPR, 5274 vectype1, tem)); 5275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5276 } 5277 5278 new_temp = make_ssa_name (vectype1); 5279 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); 5280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5281 } 5282 5283 if (reduce_with_shift && !slp_reduc) 5284 { 5285 int element_bitsize = tree_to_uhwi (bitsize); 5286 /* Enforced by vectorizable_reduction, which disallows SLP reductions 5287 for variable-length vectors and also requires direct target support 5288 for loop reductions. 
*/ 5289 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5290 int nelements = vec_size_in_bits / element_bitsize; 5291 vec_perm_builder sel; 5292 vec_perm_indices indices; 5293 5294 int elt_offset; 5295 5296 tree zero_vec = build_zero_cst (vectype1); 5297 /* Case 2: Create: 5298 for (offset = nelements/2; offset >= 1; offset/=2) 5299 { 5300 Create: va' = vec_shift <va, offset> 5301 Create: va = vop <va, va'> 5302 } */ 5303 5304 tree rhs; 5305 5306 if (dump_enabled_p ()) 5307 dump_printf_loc (MSG_NOTE, vect_location, 5308 "Reduce using vector shifts\n"); 5309 5310 mode1 = TYPE_MODE (vectype1); 5311 vec_dest = vect_create_destination_var (scalar_dest, vectype1); 5312 for (elt_offset = nelements / 2; 5313 elt_offset >= 1; 5314 elt_offset /= 2) 5315 { 5316 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); 5317 indices.new_vector (sel, 2, nelements); 5318 tree mask = vect_gen_perm_mask_any (vectype1, indices); 5319 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, 5320 new_temp, zero_vec, mask); 5321 new_name = make_ssa_name (vec_dest, epilog_stmt); 5322 gimple_assign_set_lhs (epilog_stmt, new_name); 5323 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5324 5325 epilog_stmt = gimple_build_assign (vec_dest, code, new_name, 5326 new_temp); 5327 new_temp = make_ssa_name (vec_dest, epilog_stmt); 5328 gimple_assign_set_lhs (epilog_stmt, new_temp); 5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5330 } 5331 5332 /* 2.4 Extract the final scalar result. Create: 5333 s_out3 = extract_field <v_out2, bitpos> */ 5334 5335 if (dump_enabled_p ()) 5336 dump_printf_loc (MSG_NOTE, vect_location, 5337 "extract scalar result\n"); 5338 5339 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, 5340 bitsize, bitsize_zero_node); 5341 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5342 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5343 gimple_assign_set_lhs (epilog_stmt, new_temp); 5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5345 scalar_results.safe_push (new_temp); 5346 } 5347 else 5348 { 5349 /* Case 3: Create: 5350 s = extract_field <v_out2, 0> 5351 for (offset = element_size; 5352 offset < vector_size; 5353 offset += element_size;) 5354 { 5355 Create: s' = extract_field <v_out2, offset> 5356 Create: s = op <s, s'> // For non SLP cases 5357 } */ 5358 5359 if (dump_enabled_p ()) 5360 dump_printf_loc (MSG_NOTE, vect_location, 5361 "Reduce using scalar code.\n"); 5362 5363 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5364 int element_bitsize = tree_to_uhwi (bitsize); 5365 FOR_EACH_VEC_ELT (new_phis, i, new_phi) 5366 { 5367 int bit_offset; 5368 if (gimple_code (new_phi) == GIMPLE_PHI) 5369 vec_temp = PHI_RESULT (new_phi); 5370 else 5371 vec_temp = gimple_assign_lhs (new_phi); 5372 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 5373 bitsize_zero_node); 5374 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5376 gimple_assign_set_lhs (epilog_stmt, new_temp); 5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5378 5379 /* In SLP we don't need to apply reduction operation, so we just 5380 collect s' values in SCALAR_RESULTS. 
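        For example (illustrative), for an SLP group of size 4 computed in a
        single V4SImode vector, the four extracted elements are the four
        scalar results and no further combining is needed.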
*/ 5381 if (slp_reduc) 5382 scalar_results.safe_push (new_temp); 5383 5384 for (bit_offset = element_bitsize; 5385 bit_offset < vec_size_in_bits; 5386 bit_offset += element_bitsize) 5387 { 5388 tree bitpos = bitsize_int (bit_offset); 5389 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, 5390 bitsize, bitpos); 5391 5392 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5393 new_name = make_ssa_name (new_scalar_dest, epilog_stmt); 5394 gimple_assign_set_lhs (epilog_stmt, new_name); 5395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5396 5397 if (slp_reduc) 5398 { 5399 /* In SLP we don't need to apply reduction operation, so 5400 we just collect s' values in SCALAR_RESULTS. */ 5401 new_temp = new_name; 5402 scalar_results.safe_push (new_name); 5403 } 5404 else 5405 { 5406 epilog_stmt = gimple_build_assign (new_scalar_dest, code, 5407 new_name, new_temp); 5408 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5409 gimple_assign_set_lhs (epilog_stmt, new_temp); 5410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5411 } 5412 } 5413 } 5414 5415 /* The only case where we need to reduce scalar results in SLP, is 5416 unrolling. If the size of SCALAR_RESULTS is greater than 5417 REDUC_GROUP_SIZE, we reduce them combining elements modulo 5418 REDUC_GROUP_SIZE. */ 5419 if (slp_reduc) 5420 { 5421 tree res, first_res, new_res; 5422 gimple *new_stmt; 5423 5424 /* Reduce multiple scalar results in case of SLP unrolling. */ 5425 for (j = group_size; scalar_results.iterate (j, &res); 5426 j++) 5427 { 5428 first_res = scalar_results[j % group_size]; 5429 new_stmt = gimple_build_assign (new_scalar_dest, code, 5430 first_res, res); 5431 new_res = make_ssa_name (new_scalar_dest, new_stmt); 5432 gimple_assign_set_lhs (new_stmt, new_res); 5433 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT); 5434 scalar_results[j % group_size] = new_res; 5435 } 5436 } 5437 else 5438 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ 5439 scalar_results.safe_push (new_temp); 5440 } 5441 5442 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5443 == INTEGER_INDUC_COND_REDUCTION) 5444 && !operand_equal_p (initial_def, induc_val, 0)) 5445 { 5446 /* Earlier we set the initial value to be a vector if induc_val 5447 values. Check the result and if it is induc_val then replace 5448 with the original initial value, unless induc_val is 5449 the same as initial_def already. */ 5450 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 5451 induc_val); 5452 5453 tree tmp = make_ssa_name (new_scalar_dest); 5454 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 5455 initial_def, new_temp); 5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5457 scalar_results[0] = tmp; 5458 } 5459 } 5460 5461 vect_finalize_reduction: 5462 5463 if (double_reduc) 5464 loop = loop->inner; 5465 5466 /* 2.5 Adjust the final result by the initial value of the reduction 5467 variable. (When such adjustment is not needed, then 5468 'adjustment_def' is zero). 
For example, if code is PLUS we create: 5469 new_temp = loop_exit_def + adjustment_def */ 5470 5471 if (adjustment_def) 5472 { 5473 gcc_assert (!slp_reduc); 5474 if (nested_in_vect_loop) 5475 { 5476 new_phi = new_phis[0]; 5477 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); 5478 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); 5479 new_dest = vect_create_destination_var (scalar_dest, vectype); 5480 } 5481 else 5482 { 5483 new_temp = scalar_results[0]; 5484 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 5485 expr = build2 (code, scalar_type, new_temp, adjustment_def); 5486 new_dest = vect_create_destination_var (scalar_dest, scalar_type); 5487 } 5488 5489 epilog_stmt = gimple_build_assign (new_dest, expr); 5490 new_temp = make_ssa_name (new_dest, epilog_stmt); 5491 gimple_assign_set_lhs (epilog_stmt, new_temp); 5492 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5493 if (nested_in_vect_loop) 5494 { 5495 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); 5496 STMT_VINFO_RELATED_STMT (epilog_stmt_info) 5497 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi)); 5498 5499 if (!double_reduc) 5500 scalar_results.quick_push (new_temp); 5501 else 5502 scalar_results[0] = new_temp; 5503 } 5504 else 5505 scalar_results[0] = new_temp; 5506 5507 new_phis[0] = epilog_stmt; 5508 } 5509 5510 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 5511 phis with new adjusted scalar results, i.e., replace use <s_out0> 5512 with use <s_out4>. 5513 5514 Transform: 5515 loop_exit: 5516 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5517 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5518 v_out2 = reduce <v_out1> 5519 s_out3 = extract_field <v_out2, 0> 5520 s_out4 = adjust_result <s_out3> 5521 use <s_out0> 5522 use <s_out0> 5523 5524 into: 5525 5526 loop_exit: 5527 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5528 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5529 v_out2 = reduce <v_out1> 5530 s_out3 = extract_field <v_out2, 0> 5531 s_out4 = adjust_result <s_out3> 5532 use <s_out4> 5533 use <s_out4> */ 5534 5535 5536 /* In SLP reduction chain we reduce vector results into one vector if 5537 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the 5538 LHS of the last stmt in the reduction chain, since we are looking for 5539 the loop exit phi node. */ 5540 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 5541 { 5542 stmt_vec_info dest_stmt_info 5543 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]); 5544 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt); 5545 group_size = 1; 5546 } 5547 5548 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in 5549 case that REDUC_GROUP_SIZE is greater than vectorization factor). 5550 Therefore, we need to match SCALAR_RESULTS with corresponding statements. 5551 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results 5552 correspond to the first vector stmt, etc. 5553 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). 
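        For example, with REDUC_GROUP_SIZE == 4 and two new vector stmts,
        RATIO is 2: scalar results 0 and 1 are matched with the first vector
        stmt, and results 2 and 3 with the second.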
*/ 5554 if (group_size > new_phis.length ()) 5555 { 5556 ratio = group_size / new_phis.length (); 5557 gcc_assert (!(group_size % new_phis.length ())); 5558 } 5559 else 5560 ratio = 1; 5561 5562 stmt_vec_info epilog_stmt_info = NULL; 5563 for (k = 0; k < group_size; k++) 5564 { 5565 if (k % ratio == 0) 5566 { 5567 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]); 5568 reduction_phi_info = reduction_phis[k / ratio]; 5569 if (double_reduc) 5570 inner_phi = inner_phis[k / ratio]; 5571 } 5572 5573 if (slp_reduc) 5574 { 5575 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; 5576 5577 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); 5578 /* SLP statements can't participate in patterns. */ 5579 gcc_assert (!orig_stmt_info); 5580 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); 5581 } 5582 5583 phis.create (3); 5584 /* Find the loop-closed-use at the loop exit of the original scalar 5585 result. (The reduction result is expected to have two immediate uses - 5586 one at the latch block, and one at the loop exit). */ 5587 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 5588 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))) 5589 && !is_gimple_debug (USE_STMT (use_p))) 5590 phis.safe_push (USE_STMT (use_p)); 5591 5592 /* While we expect to have found an exit_phi because of loop-closed-ssa 5593 form we can end up without one if the scalar cycle is dead. */ 5594 5595 FOR_EACH_VEC_ELT (phis, i, exit_phi) 5596 { 5597 if (outer_loop) 5598 { 5599 stmt_vec_info exit_phi_vinfo 5600 = loop_vinfo->lookup_stmt (exit_phi); 5601 gphi *vect_phi; 5602 5603 if (double_reduc) 5604 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi; 5605 else 5606 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info; 5607 if (!double_reduc 5608 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) 5609 != vect_double_reduction_def) 5610 continue; 5611 5612 /* Handle double reduction: 5613 5614 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) 5615 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop) 5616 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) 5617 stmt4: s2 = phi <s4> - double reduction stmt (outer loop) 5618 5619 At that point the regular reduction (stmt2 and stmt3) is 5620 already vectorized, as well as the exit phi node, stmt4. 5621 Here we vectorize the phi node of double reduction, stmt1, and 5622 update all relevant statements. */ 5623 5624 /* Go through all the uses of s2 to find double reduction phi 5625 node, i.e., stmt1 above. */ 5626 orig_name = PHI_RESULT (exit_phi); 5627 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 5628 { 5629 stmt_vec_info use_stmt_vinfo; 5630 tree vect_phi_init, preheader_arg, vect_phi_res; 5631 basic_block bb = gimple_bb (use_stmt); 5632 5633 /* Check that USE_STMT is really double reduction phi 5634 node. */ 5635 if (gimple_code (use_stmt) != GIMPLE_PHI 5636 || gimple_phi_num_args (use_stmt) != 2 5637 || bb->loop_father != outer_loop) 5638 continue; 5639 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt); 5640 if (!use_stmt_vinfo 5641 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) 5642 != vect_double_reduction_def) 5643 continue; 5644 5645 /* Create vector phi node for double reduction: 5646 vs1 = phi <vs0, vs2> 5647 vs1 was created previously in this function by a call to 5648 vect_get_vec_def_for_operand and is stored in 5649 vec_initial_def; 5650 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI; 5651 vs0 is created here. */ 5652 5653 /* Create vector phi node. 
*/ 5654 vect_phi = create_phi_node (vec_initial_def, bb); 5655 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi); 5656 5657 /* Create vs0 - initial def of the double reduction phi. */ 5658 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, 5659 loop_preheader_edge (outer_loop)); 5660 vect_phi_init = get_initial_def_for_reduction 5661 (stmt_info, preheader_arg, NULL); 5662 5663 /* Update phi node arguments with vs0 and vs2. */ 5664 add_phi_arg (vect_phi, vect_phi_init, 5665 loop_preheader_edge (outer_loop), 5666 UNKNOWN_LOCATION); 5667 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt), 5668 loop_latch_edge (outer_loop), UNKNOWN_LOCATION); 5669 if (dump_enabled_p ()) 5670 dump_printf_loc (MSG_NOTE, vect_location, 5671 "created double reduction phi node: %G", 5672 vect_phi); 5673 5674 vect_phi_res = PHI_RESULT (vect_phi); 5675 5676 /* Replace the use, i.e., set the correct vs1 in the regular 5677 reduction phi node. FORNOW, NCOPIES is always 1, so the 5678 loop is redundant. */ 5679 stmt_vec_info use_info = reduction_phi_info; 5680 for (j = 0; j < ncopies; j++) 5681 { 5682 edge pr_edge = loop_preheader_edge (loop); 5683 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt), 5684 pr_edge->dest_idx, vect_phi_res); 5685 use_info = STMT_VINFO_RELATED_STMT (use_info); 5686 } 5687 } 5688 } 5689 } 5690 5691 phis.release (); 5692 if (nested_in_vect_loop) 5693 { 5694 if (double_reduc) 5695 loop = outer_loop; 5696 else 5697 continue; 5698 } 5699 5700 phis.create (3); 5701 /* Find the loop-closed-use at the loop exit of the original scalar 5702 result. (The reduction result is expected to have two immediate uses, 5703 one at the latch block, and one at the loop exit). For double 5704 reductions we are looking for exit phis of the outer loop. */ 5705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 5706 { 5707 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 5708 { 5709 if (!is_gimple_debug (USE_STMT (use_p))) 5710 phis.safe_push (USE_STMT (use_p)); 5711 } 5712 else 5713 { 5714 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) 5715 { 5716 tree phi_res = PHI_RESULT (USE_STMT (use_p)); 5717 5718 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) 5719 { 5720 if (!flow_bb_inside_loop_p (loop, 5721 gimple_bb (USE_STMT (phi_use_p))) 5722 && !is_gimple_debug (USE_STMT (phi_use_p))) 5723 phis.safe_push (USE_STMT (phi_use_p)); 5724 } 5725 } 5726 } 5727 } 5728 5729 FOR_EACH_VEC_ELT (phis, i, exit_phi) 5730 { 5731 /* Replace the uses: */ 5732 orig_name = PHI_RESULT (exit_phi); 5733 scalar_result = scalar_results[k]; 5734 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 5735 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 5736 SET_USE (use_p, scalar_result); 5737 } 5738 5739 phis.release (); 5740 } 5741 } 5742 5743 /* Return a vector of type VECTYPE that is equal to the vector select 5744 operation "MASK ? VEC : IDENTITY". Insert the select statements 5745 before GSI. */ 5746 5747 static tree 5748 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, 5749 tree vec, tree identity) 5750 { 5751 tree cond = make_temp_ssa_name (vectype, NULL, "cond"); 5752 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, 5753 mask, vec, identity); 5754 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5755 return cond; 5756 } 5757 5758 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right 5759 order, starting with LHS. Insert the extraction statements before GSI and 5760 associate the new scalar SSA names with variable SCALAR_DEST. 
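   For example (sketch), with CODE == PLUS_EXPR, LHS == init and a
   four-element VECTOR_RHS v, the generated code computes
   (((init + v[0]) + v[1]) + v[2]) + v[3], using one extraction and one
   addition per element.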
5761 Return the SSA name for the result. */ 5762 5763 static tree 5764 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, 5765 tree_code code, tree lhs, tree vector_rhs) 5766 { 5767 tree vectype = TREE_TYPE (vector_rhs); 5768 tree scalar_type = TREE_TYPE (vectype); 5769 tree bitsize = TYPE_SIZE (scalar_type); 5770 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 5771 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); 5772 5773 for (unsigned HOST_WIDE_INT bit_offset = 0; 5774 bit_offset < vec_size_in_bits; 5775 bit_offset += element_bitsize) 5776 { 5777 tree bitpos = bitsize_int (bit_offset); 5778 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs, 5779 bitsize, bitpos); 5780 5781 gassign *stmt = gimple_build_assign (scalar_dest, rhs); 5782 rhs = make_ssa_name (scalar_dest, stmt); 5783 gimple_assign_set_lhs (stmt, rhs); 5784 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 5785 5786 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs); 5787 tree new_name = make_ssa_name (scalar_dest, stmt); 5788 gimple_assign_set_lhs (stmt, new_name); 5789 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 5790 lhs = new_name; 5791 } 5792 return lhs; 5793 } 5794 5795 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the 5796 statement that sets the live-out value. REDUC_DEF_STMT is the phi 5797 statement. CODE is the operation performed by STMT_INFO and OPS are 5798 its scalar operands. REDUC_INDEX is the index of the operand in 5799 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that 5800 implements in-order reduction, or IFN_LAST if we should open-code it. 5801 VECTYPE_IN is the type of the vector input. MASKS specifies the masks 5802 that should be used to control the operation in a fully-masked loop. */ 5803 5804 static bool 5805 vectorize_fold_left_reduction (stmt_vec_info stmt_info, 5806 gimple_stmt_iterator *gsi, 5807 stmt_vec_info *vec_stmt, slp_tree slp_node, 5808 gimple *reduc_def_stmt, 5809 tree_code code, internal_fn reduc_fn, 5810 tree ops[3], tree vectype_in, 5811 int reduc_index, vec_loop_masks *masks) 5812 { 5813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 5815 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 5816 stmt_vec_info new_stmt_info = NULL; 5817 5818 int ncopies; 5819 if (slp_node) 5820 ncopies = 1; 5821 else 5822 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 5823 5824 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); 5825 gcc_assert (ncopies == 1); 5826 gcc_assert (TREE_CODE_LENGTH (code) == binary_op); 5827 gcc_assert (reduc_index == (code == MINUS_EXPR ? 
0 : 1)); 5828 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5829 == FOLD_LEFT_REDUCTION); 5830 5831 if (slp_node) 5832 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), 5833 TYPE_VECTOR_SUBPARTS (vectype_in))); 5834 5835 tree op0 = ops[1 - reduc_index]; 5836 5837 int group_size = 1; 5838 stmt_vec_info scalar_dest_def_info; 5839 auto_vec<tree> vec_oprnds0; 5840 if (slp_node) 5841 { 5842 auto_vec<vec<tree> > vec_defs (2); 5843 auto_vec<tree> sops(2); 5844 sops.quick_push (ops[0]); 5845 sops.quick_push (ops[1]); 5846 vect_get_slp_defs (sops, slp_node, &vec_defs); 5847 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]); 5848 vec_defs[0].release (); 5849 vec_defs[1].release (); 5850 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 5851 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 5852 } 5853 else 5854 { 5855 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info); 5856 vec_oprnds0.create (1); 5857 vec_oprnds0.quick_push (loop_vec_def0); 5858 scalar_dest_def_info = stmt_info; 5859 } 5860 5861 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt); 5862 tree scalar_type = TREE_TYPE (scalar_dest); 5863 tree reduc_var = gimple_phi_result (reduc_def_stmt); 5864 5865 int vec_num = vec_oprnds0.length (); 5866 gcc_assert (vec_num == 1 || slp_node); 5867 tree vec_elem_type = TREE_TYPE (vectype_out); 5868 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type)); 5869 5870 tree vector_identity = NULL_TREE; 5871 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 5872 vector_identity = build_zero_cst (vectype_out); 5873 5874 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL); 5875 int i; 5876 tree def0; 5877 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 5878 { 5879 gimple *new_stmt; 5880 tree mask = NULL_TREE; 5881 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 5882 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i); 5883 5884 /* Handle MINUS by adding the negative. */ 5885 if (reduc_fn != IFN_LAST && code == MINUS_EXPR) 5886 { 5887 tree negated = make_ssa_name (vectype_out); 5888 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); 5889 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5890 def0 = negated; 5891 } 5892 5893 if (mask) 5894 def0 = merge_with_identity (gsi, mask, vectype_out, def0, 5895 vector_identity); 5896 5897 /* On the first iteration the input is simply the scalar phi 5898 result, and for subsequent iterations it is the output of 5899 the preceding operation. */ 5900 if (reduc_fn != IFN_LAST) 5901 { 5902 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); 5903 /* For chained SLP reductions the output of the previous reduction 5904 operation serves as the input of the next. For the final statement 5905 the output cannot be a temporary - we reuse the original 5906 scalar destination of the last statement. */ 5907 if (i != vec_num - 1) 5908 { 5909 gimple_set_lhs (new_stmt, scalar_dest_var); 5910 reduc_var = make_ssa_name (scalar_dest_var, new_stmt); 5911 gimple_set_lhs (new_stmt, reduc_var); 5912 } 5913 } 5914 else 5915 { 5916 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code, 5917 reduc_var, def0); 5918 new_stmt = SSA_NAME_DEF_STMT (reduc_var); 5919 /* Remove the statement, so that we can use the same code paths 5920 as for statements that we've just created. 
*/ 5921 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt); 5922 gsi_remove (&tmp_gsi, true); 5923 } 5924 5925 if (i == vec_num - 1) 5926 { 5927 gimple_set_lhs (new_stmt, scalar_dest); 5928 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info, 5929 new_stmt); 5930 } 5931 else 5932 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info, 5933 new_stmt, gsi); 5934 5935 if (slp_node) 5936 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); 5937 } 5938 5939 if (!slp_node) 5940 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; 5941 5942 return true; 5943 } 5944 5945 /* Function is_nonwrapping_integer_induction. 5946 5947 Check if STMT_VINO (which is part of loop LOOP) both increments and 5948 does not cause overflow. */ 5949 5950 static bool 5951 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop) 5952 { 5953 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt); 5954 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); 5955 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); 5956 tree lhs_type = TREE_TYPE (gimple_phi_result (phi)); 5957 widest_int ni, max_loop_value, lhs_max; 5958 wi::overflow_type overflow = wi::OVF_NONE; 5959 5960 /* Make sure the loop is integer based. */ 5961 if (TREE_CODE (base) != INTEGER_CST 5962 || TREE_CODE (step) != INTEGER_CST) 5963 return false; 5964 5965 /* Check that the max size of the loop will not wrap. */ 5966 5967 if (TYPE_OVERFLOW_UNDEFINED (lhs_type)) 5968 return true; 5969 5970 if (! max_stmt_executions (loop, &ni)) 5971 return false; 5972 5973 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type), 5974 &overflow); 5975 if (overflow) 5976 return false; 5977 5978 max_loop_value = wi::add (wi::to_widest (base), max_loop_value, 5979 TYPE_SIGN (lhs_type), &overflow); 5980 if (overflow) 5981 return false; 5982 5983 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type)) 5984 <= TYPE_PRECISION (lhs_type)); 5985 } 5986 5987 /* Function vectorizable_reduction. 5988 5989 Check if STMT_INFO performs a reduction operation that can be vectorized. 5990 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized 5991 stmt to replace it, put it in VEC_STMT, and insert it at GSI. 5992 Return true if STMT_INFO is vectorizable in this way. 5993 5994 This function also handles reduction idioms (patterns) that have been 5995 recognized in advance during vect_pattern_recog. In this case, STMT_INFO 5996 may be of this form: 5997 X = pattern_expr (arg0, arg1, ..., X) 5998 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original 5999 sequence that had been detected and replaced by the pattern-stmt 6000 (STMT_INFO). 6001 6002 This function also handles reduction of condition expressions, for example: 6003 for (int i = 0; i < N; i++) 6004 if (a[i] < value) 6005 last = a[i]; 6006 This is handled by vectorising the loop and creating an additional vector 6007 containing the loop indexes for which "a[i] < value" was true. In the 6008 function epilogue this is reduced to a single max value and then used to 6009 index into the vector of results. 6010 6011 In some cases of reduction patterns, the type of the reduction variable X is 6012 different than the type of the other arguments of STMT_INFO. 
6013 In such cases, the vectype that is used when transforming STMT_INFO into 6014 a vector stmt is different than the vectype that is used to determine the 6015 vectorization factor, because it consists of a different number of elements 6016 than the actual number of elements that are being operated upon in parallel. 6017 6018 For example, consider an accumulation of shorts into an int accumulator. 6019 On some targets it's possible to vectorize this pattern operating on 8 6020 shorts at a time (hence, the vectype for purposes of determining the 6021 vectorization factor should be V8HI); on the other hand, the vectype that 6022 is used to create the vector form is actually V4SI (the type of the result). 6023 6024 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 6025 indicates what is the actual level of parallelism (V8HI in the example), so 6026 that the right vectorization factor would be derived. This vectype 6027 corresponds to the type of arguments to the reduction stmt, and should *NOT* 6028 be used to create the vectorized stmt. The right vectype for the vectorized 6029 stmt is obtained from the type of the result X: 6030 get_vectype_for_scalar_type (TREE_TYPE (X)) 6031 6032 This means that, contrary to "regular" reductions (or "regular" stmts in 6033 general), the following equation: 6034 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) 6035 does *NOT* necessarily hold for reduction patterns. */ 6036 6037 bool 6038 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, 6039 stmt_vec_info *vec_stmt, slp_tree slp_node, 6040 slp_instance slp_node_instance, 6041 stmt_vector_for_cost *cost_vec) 6042 { 6043 tree vec_dest; 6044 tree scalar_dest; 6045 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6046 tree vectype_in = NULL_TREE; 6047 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6048 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6049 enum tree_code code, orig_code; 6050 internal_fn reduc_fn; 6051 machine_mode vec_mode; 6052 int op_type; 6053 optab optab; 6054 tree new_temp = NULL_TREE; 6055 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; 6056 stmt_vec_info cond_stmt_vinfo = NULL; 6057 enum tree_code cond_reduc_op_code = ERROR_MARK; 6058 tree scalar_type; 6059 bool is_simple_use; 6060 int i; 6061 int ncopies; 6062 int epilog_copies; 6063 stmt_vec_info prev_stmt_info, prev_phi_info; 6064 bool single_defuse_cycle = false; 6065 stmt_vec_info new_stmt_info = NULL; 6066 int j; 6067 tree ops[3]; 6068 enum vect_def_type dts[3]; 6069 bool nested_cycle = false, found_nested_cycle_def = false; 6070 bool double_reduc = false; 6071 basic_block def_bb; 6072 struct loop * def_stmt_loop; 6073 tree def_arg; 6074 auto_vec<tree> vec_oprnds0; 6075 auto_vec<tree> vec_oprnds1; 6076 auto_vec<tree> vec_oprnds2; 6077 auto_vec<tree> vect_defs; 6078 auto_vec<stmt_vec_info> phis; 6079 int vec_num; 6080 tree def0, tem; 6081 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; 6082 tree cond_reduc_val = NULL_TREE; 6083 6084 /* Make sure it was already recognized as a reduction computation. 
*/ 6085 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def 6086 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) 6087 return false; 6088 6089 if (nested_in_vect_loop_p (loop, stmt_info)) 6090 { 6091 loop = loop->inner; 6092 nested_cycle = true; 6093 } 6094 6095 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6096 gcc_assert (slp_node 6097 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); 6098 6099 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt)) 6100 { 6101 tree phi_result = gimple_phi_result (phi); 6102 /* Analysis is fully done on the reduction stmt invocation. */ 6103 if (! vec_stmt) 6104 { 6105 if (slp_node) 6106 slp_node_instance->reduc_phis = slp_node; 6107 6108 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6109 return true; 6110 } 6111 6112 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) 6113 /* Leave the scalar phi in place. Note that checking 6114 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works 6115 for reductions involving a single statement. */ 6116 return true; 6117 6118 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); 6119 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); 6120 6121 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info) 6122 == EXTRACT_LAST_REDUCTION) 6123 /* Leave the scalar phi in place. */ 6124 return true; 6125 6126 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt); 6127 code = gimple_assign_rhs_code (reduc_stmt); 6128 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) 6129 { 6130 tree op = gimple_op (reduc_stmt, k); 6131 if (op == phi_result) 6132 continue; 6133 if (k == 1 && code == COND_EXPR) 6134 continue; 6135 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt); 6136 gcc_assert (is_simple_use); 6137 if (dt == vect_constant_def || dt == vect_external_def) 6138 continue; 6139 if (!vectype_in 6140 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6141 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op))))) 6142 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op)); 6143 break; 6144 } 6145 /* For a nested cycle we might end up with an operation like 6146 phi_result * phi_result. */ 6147 if (!vectype_in) 6148 vectype_in = STMT_VINFO_VECTYPE (stmt_info); 6149 gcc_assert (vectype_in); 6150 6151 if (slp_node) 6152 ncopies = 1; 6153 else 6154 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6155 6156 stmt_vec_info use_stmt_info; 6157 if (ncopies > 1 6158 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live 6159 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result)) 6160 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info) 6161 single_defuse_cycle = true; 6162 6163 /* Create the destination vector */ 6164 scalar_dest = gimple_assign_lhs (reduc_stmt); 6165 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 6166 6167 if (slp_node) 6168 /* The size vect_schedule_slp_instance computes is off for us. */ 6169 vec_num = vect_get_num_vectors 6170 (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6171 * SLP_TREE_SCALAR_STMTS (slp_node).length (), 6172 vectype_in); 6173 else 6174 vec_num = 1; 6175 6176 /* Generate the reduction PHIs upfront. */ 6177 prev_phi_info = NULL; 6178 for (j = 0; j < ncopies; j++) 6179 { 6180 if (j == 0 || !single_defuse_cycle) 6181 { 6182 for (i = 0; i < vec_num; i++) 6183 { 6184 /* Create the reduction-phi that defines the reduction 6185 operand. 
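                 The PHI is created without arguments here; its initial value
                 on the preheader edge and the loop-carried def on the latch
                 edge are expected to be filled in later, when the reduction
                 statement itself is vectorized and the epilogue is generated.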
*/ 6186 gimple *new_phi = create_phi_node (vec_dest, loop->header); 6187 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); 6188 6189 if (slp_node) 6190 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); 6191 else 6192 { 6193 if (j == 0) 6194 STMT_VINFO_VEC_STMT (stmt_info) 6195 = *vec_stmt = new_phi_info; 6196 else 6197 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; 6198 prev_phi_info = new_phi_info; 6199 } 6200 } 6201 } 6202 } 6203 6204 return true; 6205 } 6206 6207 /* 1. Is vectorizable reduction? */ 6208 /* Not supportable if the reduction variable is used in the loop, unless 6209 it's a reduction chain. */ 6210 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer 6211 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6212 return false; 6213 6214 /* Reductions that are not used even in an enclosing outer-loop, 6215 are expected to be "live" (used out of the loop). */ 6216 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope 6217 && !STMT_VINFO_LIVE_P (stmt_info)) 6218 return false; 6219 6220 /* 2. Has this been recognized as a reduction pattern? 6221 6222 Check if STMT represents a pattern that has been recognized 6223 in earlier analysis stages. For stmts that represent a pattern, 6224 the STMT_VINFO_RELATED_STMT field records the last stmt in 6225 the original sequence that constitutes the pattern. */ 6226 6227 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); 6228 if (orig_stmt_info) 6229 { 6230 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 6231 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 6232 } 6233 6234 /* 3. Check the operands of the operation. The first operands are defined 6235 inside the loop body. The last operand is the reduction variable, 6236 which is defined by the loop-header-phi. */ 6237 6238 gassign *stmt = as_a <gassign *> (stmt_info->stmt); 6239 6240 /* Flatten RHS. */ 6241 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 6242 { 6243 case GIMPLE_BINARY_RHS: 6244 code = gimple_assign_rhs_code (stmt); 6245 op_type = TREE_CODE_LENGTH (code); 6246 gcc_assert (op_type == binary_op); 6247 ops[0] = gimple_assign_rhs1 (stmt); 6248 ops[1] = gimple_assign_rhs2 (stmt); 6249 break; 6250 6251 case GIMPLE_TERNARY_RHS: 6252 code = gimple_assign_rhs_code (stmt); 6253 op_type = TREE_CODE_LENGTH (code); 6254 gcc_assert (op_type == ternary_op); 6255 ops[0] = gimple_assign_rhs1 (stmt); 6256 ops[1] = gimple_assign_rhs2 (stmt); 6257 ops[2] = gimple_assign_rhs3 (stmt); 6258 break; 6259 6260 case GIMPLE_UNARY_RHS: 6261 return false; 6262 6263 default: 6264 gcc_unreachable (); 6265 } 6266 6267 if (code == COND_EXPR && slp_node) 6268 return false; 6269 6270 scalar_dest = gimple_assign_lhs (stmt); 6271 scalar_type = TREE_TYPE (scalar_dest); 6272 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 6273 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 6274 return false; 6275 6276 /* Do not try to vectorize bit-precision reductions. */ 6277 if (!type_has_mode_precision_p (scalar_type)) 6278 return false; 6279 6280 /* All uses but the last are expected to be defined in the loop. 6281 The last use is the reduction variable. In case of nested cycle this 6282 assumption is not true: we use reduc_index to record the index of the 6283 reduction variable. 
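     For example, for sum_2 = _t + sum_1 with the reduction PHI
     sum_1 = PHI <init, sum_2>, the reduction variable is operand 1 of the
     statement and REDUC_INDEX becomes 1.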
*/ 6284 stmt_vec_info reduc_def_info; 6285 if (orig_stmt_info) 6286 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info); 6287 else 6288 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info); 6289 gcc_assert (reduc_def_info); 6290 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt); 6291 tree reduc_def = PHI_RESULT (reduc_def_phi); 6292 int reduc_index = -1; 6293 for (i = 0; i < op_type; i++) 6294 { 6295 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ 6296 if (i == 0 && code == COND_EXPR) 6297 continue; 6298 6299 stmt_vec_info def_stmt_info; 6300 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem, 6301 &def_stmt_info); 6302 dt = dts[i]; 6303 gcc_assert (is_simple_use); 6304 if (dt == vect_reduction_def 6305 && ops[i] == reduc_def) 6306 { 6307 reduc_index = i; 6308 continue; 6309 } 6310 else if (tem) 6311 { 6312 /* To properly compute ncopies we are interested in the widest 6313 input type in case we're looking at a widening accumulation. */ 6314 if (!vectype_in 6315 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6316 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) 6317 vectype_in = tem; 6318 } 6319 6320 if (dt != vect_internal_def 6321 && dt != vect_external_def 6322 && dt != vect_constant_def 6323 && dt != vect_induction_def 6324 && !(dt == vect_nested_cycle && nested_cycle)) 6325 return false; 6326 6327 if (dt == vect_nested_cycle 6328 && ops[i] == reduc_def) 6329 { 6330 found_nested_cycle_def = true; 6331 reduc_index = i; 6332 } 6333 6334 if (i == 1 && code == COND_EXPR) 6335 { 6336 /* Record how value of COND_EXPR is defined. */ 6337 if (dt == vect_constant_def) 6338 { 6339 cond_reduc_dt = dt; 6340 cond_reduc_val = ops[i]; 6341 } 6342 if (dt == vect_induction_def 6343 && def_stmt_info 6344 && is_nonwrapping_integer_induction (def_stmt_info, loop)) 6345 { 6346 cond_reduc_dt = dt; 6347 cond_stmt_vinfo = def_stmt_info; 6348 } 6349 } 6350 } 6351 6352 if (!vectype_in) 6353 vectype_in = vectype_out; 6354 6355 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not 6356 directy used in stmt. */ 6357 if (reduc_index == -1) 6358 { 6359 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) 6360 { 6361 if (dump_enabled_p ()) 6362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6363 "in-order reduction chain without SLP.\n"); 6364 return false; 6365 } 6366 } 6367 6368 if (!(reduc_index == -1 6369 || dts[reduc_index] == vect_reduction_def 6370 || dts[reduc_index] == vect_nested_cycle 6371 || ((dts[reduc_index] == vect_internal_def 6372 || dts[reduc_index] == vect_external_def 6373 || dts[reduc_index] == vect_constant_def 6374 || dts[reduc_index] == vect_induction_def) 6375 && nested_cycle && found_nested_cycle_def))) 6376 { 6377 /* For pattern recognized stmts, orig_stmt might be a reduction, 6378 but some helper statements for the pattern might not, or 6379 might be COND_EXPRs with reduction uses in the condition. */ 6380 gcc_assert (orig_stmt_info); 6381 return false; 6382 } 6383 6384 /* PHIs should not participate in patterns. */ 6385 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info)); 6386 enum vect_reduction_type v_reduc_type 6387 = STMT_VINFO_REDUC_TYPE (reduc_def_info); 6388 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info); 6389 6390 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type; 6391 /* If we have a condition reduction, see if we can simplify it further. 
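     The simplifications attempted below are, roughly: a compile-time
     constant data value (CONST_COND_REDUCTION using MIN/MAX), target
     support for FOLD_EXTRACT_LAST (EXTRACT_LAST_REDUCTION), and a data
     value that is itself a nonwrapping integer induction
     (INTEGER_INDUC_COND_REDUCTION).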
*/ 6392 if (v_reduc_type == COND_REDUCTION) 6393 { 6394 /* TODO: We can't yet handle reduction chains, since we need to treat 6395 each COND_EXPR in the chain specially, not just the last one. 6396 E.g. for: 6397 6398 x_1 = PHI <x_3, ...> 6399 x_2 = a_2 ? ... : x_1; 6400 x_3 = a_3 ? ... : x_2; 6401 6402 we're interested in the last element in x_3 for which a_2 || a_3 6403 is true, whereas the current reduction chain handling would 6404 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3 6405 as a reduction operation. */ 6406 if (reduc_index == -1) 6407 { 6408 if (dump_enabled_p ()) 6409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6410 "conditional reduction chains not supported\n"); 6411 return false; 6412 } 6413 6414 /* vect_is_simple_reduction ensured that operand 2 is the 6415 loop-carried operand. */ 6416 gcc_assert (reduc_index == 2); 6417 6418 /* Loop peeling modifies initial value of reduction PHI, which 6419 makes the reduction stmt to be transformed different to the 6420 original stmt analyzed. We need to record reduction code for 6421 CONST_COND_REDUCTION type reduction at analyzing stage, thus 6422 it can be used directly at transform stage. */ 6423 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR 6424 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR) 6425 { 6426 /* Also set the reduction type to CONST_COND_REDUCTION. */ 6427 gcc_assert (cond_reduc_dt == vect_constant_def); 6428 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; 6429 } 6430 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, 6431 vectype_in, OPTIMIZE_FOR_SPEED)) 6432 { 6433 if (dump_enabled_p ()) 6434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6435 "optimizing condition reduction with" 6436 " FOLD_EXTRACT_LAST.\n"); 6437 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; 6438 } 6439 else if (cond_reduc_dt == vect_induction_def) 6440 { 6441 tree base 6442 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); 6443 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); 6444 6445 gcc_assert (TREE_CODE (base) == INTEGER_CST 6446 && TREE_CODE (step) == INTEGER_CST); 6447 cond_reduc_val = NULL_TREE; 6448 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); 6449 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) 6450 ; 6451 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR 6452 above base; punt if base is the minimum value of the type for 6453 MAX_EXPR or maximum value of the type for MIN_EXPR for now. 
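             For example, with base == 7 and a negative step we pick MIN_EXPR
             and cond_reduc_val == 8; with base == -3 and a positive step we
             pick MAX_EXPR and cond_reduc_val == -4.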
*/ 6454 else if (tree_int_cst_sgn (step) == -1) 6455 { 6456 cond_reduc_op_code = MIN_EXPR; 6457 if (tree_int_cst_sgn (base) == -1) 6458 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6459 else if (tree_int_cst_lt (base, 6460 TYPE_MAX_VALUE (TREE_TYPE (base)))) 6461 cond_reduc_val 6462 = int_const_binop (PLUS_EXPR, base, integer_one_node); 6463 } 6464 else 6465 { 6466 cond_reduc_op_code = MAX_EXPR; 6467 if (tree_int_cst_sgn (base) == 1) 6468 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6469 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), 6470 base)) 6471 cond_reduc_val 6472 = int_const_binop (MINUS_EXPR, base, integer_one_node); 6473 } 6474 if (cond_reduc_val) 6475 { 6476 if (dump_enabled_p ()) 6477 dump_printf_loc (MSG_NOTE, vect_location, 6478 "condition expression based on " 6479 "integer induction.\n"); 6480 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6481 = INTEGER_INDUC_COND_REDUCTION; 6482 } 6483 } 6484 else if (cond_reduc_dt == vect_constant_def) 6485 { 6486 enum vect_def_type cond_initial_dt; 6487 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]); 6488 tree cond_initial_val 6489 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); 6490 6491 gcc_assert (cond_reduc_val != NULL_TREE); 6492 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); 6493 if (cond_initial_dt == vect_constant_def 6494 && types_compatible_p (TREE_TYPE (cond_initial_val), 6495 TREE_TYPE (cond_reduc_val))) 6496 { 6497 tree e = fold_binary (LE_EXPR, boolean_type_node, 6498 cond_initial_val, cond_reduc_val); 6499 if (e && (integer_onep (e) || integer_zerop (e))) 6500 { 6501 if (dump_enabled_p ()) 6502 dump_printf_loc (MSG_NOTE, vect_location, 6503 "condition expression based on " 6504 "compile time constant.\n"); 6505 /* Record reduction code at analysis stage. */ 6506 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) 6507 = integer_onep (e) ? MAX_EXPR : MIN_EXPR; 6508 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6509 = CONST_COND_REDUCTION; 6510 } 6511 } 6512 } 6513 } 6514 6515 if (orig_stmt_info) 6516 gcc_assert (tmp == orig_stmt_info 6517 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info); 6518 else 6519 /* We changed STMT to be the first stmt in reduction chain, hence we 6520 check that in this case the first element in the chain is STMT. 
*/ 6521 gcc_assert (tmp == stmt_info 6522 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info); 6523 6524 if (STMT_VINFO_LIVE_P (reduc_def_info)) 6525 return false; 6526 6527 if (slp_node) 6528 ncopies = 1; 6529 else 6530 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6531 6532 gcc_assert (ncopies >= 1); 6533 6534 vec_mode = TYPE_MODE (vectype_in); 6535 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); 6536 6537 if (nested_cycle) 6538 { 6539 def_bb = gimple_bb (reduc_def_phi); 6540 def_stmt_loop = def_bb->loop_father; 6541 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, 6542 loop_preheader_edge (def_stmt_loop)); 6543 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg); 6544 if (def_arg_stmt_info 6545 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info) 6546 == vect_double_reduction_def)) 6547 double_reduc = true; 6548 } 6549 6550 vect_reduction_type reduction_type 6551 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); 6552 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) 6553 && ncopies > 1) 6554 { 6555 if (dump_enabled_p ()) 6556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6557 "multiple types in double reduction or condition " 6558 "reduction.\n"); 6559 return false; 6560 } 6561 6562 if (code == COND_EXPR) 6563 { 6564 /* Only call during the analysis stage, otherwise we'll lose 6565 STMT_VINFO_TYPE. */ 6566 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL, 6567 true, NULL, cost_vec)) 6568 { 6569 if (dump_enabled_p ()) 6570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6571 "unsupported condition in reduction\n"); 6572 return false; 6573 } 6574 } 6575 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR 6576 || code == LROTATE_EXPR || code == RROTATE_EXPR) 6577 { 6578 /* Only call during the analysis stage, otherwise we'll lose 6579 STMT_VINFO_TYPE. We only support this for nested cycles 6580 without double reductions at the moment. */ 6581 if (!nested_cycle 6582 || double_reduc 6583 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL, 6584 NULL, cost_vec))) 6585 { 6586 if (dump_enabled_p ()) 6587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6588 "unsupported shift or rotation in reduction\n"); 6589 return false; 6590 } 6591 } 6592 else 6593 { 6594 /* 4. Supportable by target? */ 6595 6596 /* 4.1. check support for the operation in the loop */ 6597 optab = optab_for_tree_code (code, vectype_in, optab_default); 6598 if (!optab) 6599 { 6600 if (dump_enabled_p ()) 6601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6602 "no optab.\n"); 6603 6604 return false; 6605 } 6606 6607 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) 6608 { 6609 if (dump_enabled_p ()) 6610 dump_printf (MSG_NOTE, "op not supported by target.\n"); 6611 6612 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) 6613 || !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6614 return false; 6615 6616 if (dump_enabled_p ()) 6617 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); 6618 } 6619 6620 /* Worthwhile without SIMD support? */ 6621 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in)) 6622 && !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6623 { 6624 if (dump_enabled_p ()) 6625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6626 "not worthwhile without SIMD support.\n"); 6627 6628 return false; 6629 } 6630 } 6631 6632 /* 4.2. Check support for the epilog operation. 
6633 6634 If STMT represents a reduction pattern, then the type of the 6635 reduction variable may be different than the type of the rest 6636 of the arguments. For example, consider the case of accumulation 6637 of shorts into an int accumulator; The original code: 6638 S1: int_a = (int) short_a; 6639 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; 6640 6641 was replaced with: 6642 STMT: int_acc = widen_sum <short_a, int_acc> 6643 6644 This means that: 6645 1. The tree-code that is used to create the vector operation in the 6646 epilog code (that reduces the partial results) is not the 6647 tree-code of STMT, but is rather the tree-code of the original 6648 stmt from the pattern that STMT is replacing. I.e, in the example 6649 above we want to use 'widen_sum' in the loop, but 'plus' in the 6650 epilog. 6651 2. The type (mode) we use to check available target support 6652 for the vector operation to be created in the *epilog*, is 6653 determined by the type of the reduction variable (in the example 6654 above we'd check this: optab_handler (plus_optab, vect_int_mode])). 6655 However the type (mode) we use to check available target support 6656 for the vector operation to be created *inside the loop*, is 6657 determined by the type of the other arguments to STMT (in the 6658 example we'd check this: optab_handler (widen_sum_optab, 6659 vect_short_mode)). 6660 6661 This is contrary to "regular" reductions, in which the types of all 6662 the arguments are the same as the type of the reduction variable. 6663 For "regular" reductions we can therefore use the same vector type 6664 (and also the same tree-code) when generating the epilog code and 6665 when generating the code inside the loop. */ 6666 6667 if (orig_stmt_info 6668 && (reduction_type == TREE_CODE_REDUCTION 6669 || reduction_type == FOLD_LEFT_REDUCTION)) 6670 { 6671 /* This is a reduction pattern: get the vectype from the type of the 6672 reduction variable, and get the tree-code from orig_stmt. */ 6673 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt); 6674 gcc_assert (vectype_out); 6675 vec_mode = TYPE_MODE (vectype_out); 6676 } 6677 else 6678 { 6679 /* Regular reduction: use the same vectype and tree-code as used for 6680 the vector code inside the loop can be used for the epilog code. */ 6681 orig_code = code; 6682 6683 if (code == MINUS_EXPR) 6684 orig_code = PLUS_EXPR; 6685 6686 /* For simple condition reductions, replace with the actual expression 6687 we want to base our reduction around. */ 6688 if (reduction_type == CONST_COND_REDUCTION) 6689 { 6690 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 6691 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); 6692 } 6693 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) 6694 orig_code = cond_reduc_op_code; 6695 } 6696 6697 reduc_fn = IFN_LAST; 6698 6699 if (reduction_type == TREE_CODE_REDUCTION 6700 || reduction_type == FOLD_LEFT_REDUCTION 6701 || reduction_type == INTEGER_INDUC_COND_REDUCTION 6702 || reduction_type == CONST_COND_REDUCTION) 6703 { 6704 if (reduction_type == FOLD_LEFT_REDUCTION 6705 ? 
fold_left_reduction_fn (orig_code, &reduc_fn) 6706 : reduction_fn_for_scalar_code (orig_code, &reduc_fn)) 6707 { 6708 if (reduc_fn != IFN_LAST 6709 && !direct_internal_fn_supported_p (reduc_fn, vectype_out, 6710 OPTIMIZE_FOR_SPEED)) 6711 { 6712 if (dump_enabled_p ()) 6713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6714 "reduc op not supported by target.\n"); 6715 6716 reduc_fn = IFN_LAST; 6717 } 6718 } 6719 else 6720 { 6721 if (!nested_cycle || double_reduc) 6722 { 6723 if (dump_enabled_p ()) 6724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6725 "no reduc code for scalar code.\n"); 6726 6727 return false; 6728 } 6729 } 6730 } 6731 else if (reduction_type == COND_REDUCTION) 6732 { 6733 int scalar_precision 6734 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 6735 cr_index_scalar_type = make_unsigned_type (scalar_precision); 6736 cr_index_vector_type = build_vector_type (cr_index_scalar_type, 6737 nunits_out); 6738 6739 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, 6740 OPTIMIZE_FOR_SPEED)) 6741 reduc_fn = IFN_REDUC_MAX; 6742 } 6743 6744 if (reduction_type != EXTRACT_LAST_REDUCTION 6745 && (!nested_cycle || double_reduc) 6746 && reduc_fn == IFN_LAST 6747 && !nunits_out.is_constant ()) 6748 { 6749 if (dump_enabled_p ()) 6750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6751 "missing target support for reduction on" 6752 " variable-length vectors.\n"); 6753 return false; 6754 } 6755 6756 /* For SLP reductions, see if there is a neutral value we can use. */ 6757 tree neutral_op = NULL_TREE; 6758 if (slp_node) 6759 neutral_op = neutral_op_for_slp_reduction 6760 (slp_node_instance->reduc_phis, code, 6761 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); 6762 6763 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) 6764 { 6765 /* We can't support in-order reductions of code such as this: 6766 6767 for (int i = 0; i < n1; ++i) 6768 for (int j = 0; j < n2; ++j) 6769 l += a[j]; 6770 6771 since GCC effectively transforms the loop when vectorizing: 6772 6773 for (int i = 0; i < n1 / VF; ++i) 6774 for (int j = 0; j < n2; ++j) 6775 for (int k = 0; k < VF; ++k) 6776 l += a[j]; 6777 6778 which is a reassociation of the original operation. */ 6779 if (dump_enabled_p ()) 6780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6781 "in-order double reduction not supported.\n"); 6782 6783 return false; 6784 } 6785 6786 if (reduction_type == FOLD_LEFT_REDUCTION 6787 && slp_node 6788 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6789 { 6790 /* We cannot use in-order reductions in this case because there is 6791 an implicit reassociation of the operations involved. */ 6792 if (dump_enabled_p ()) 6793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6794 "in-order unchained SLP reductions not supported.\n"); 6795 return false; 6796 } 6797 6798 /* For double reductions, and for SLP reductions with a neutral value, 6799 we construct a variable-length initial vector by loading a vector 6800 full of the neutral value and then shift-and-inserting the start 6801 values into the low-numbered elements. 
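     For example (illustrative, with neutral value 0 and two start values
     s0 and s1), the construction proceeds roughly as

       { 0, 0, ..., 0 }          splat of the neutral value
       { s1, 0, ..., 0 }         shift-and-insert s1
       { s0, s1, 0, ..., 0 }     shift-and-insert s0

     so the start values end up in the low-numbered elements while the rest
     of the (runtime-length) vector keeps the neutral value.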
*/ 6802 if ((double_reduc || neutral_op) 6803 && !nunits_out.is_constant () 6804 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, 6805 vectype_out, OPTIMIZE_FOR_SPEED)) 6806 { 6807 if (dump_enabled_p ()) 6808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6809 "reduction on variable-length vectors requires" 6810 " target support for a vector-shift-and-insert" 6811 " operation.\n"); 6812 return false; 6813 } 6814 6815 /* Check extra constraints for variable-length unchained SLP reductions. */ 6816 if (STMT_SLP_TYPE (stmt_info) 6817 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) 6818 && !nunits_out.is_constant ()) 6819 { 6820 /* We checked above that we could build the initial vector when 6821 there's a neutral element value. Check here for the case in 6822 which each SLP statement has its own initial value and in which 6823 that value needs to be repeated for every instance of the 6824 statement within the initial vector. */ 6825 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 6826 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out)); 6827 if (!neutral_op 6828 && !can_duplicate_and_interleave_p (group_size, elt_mode)) 6829 { 6830 if (dump_enabled_p ()) 6831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6832 "unsupported form of SLP reduction for" 6833 " variable-length vectors: cannot build" 6834 " initial vector.\n"); 6835 return false; 6836 } 6837 /* The epilogue code relies on the number of elements being a multiple 6838 of the group size. The duplicate-and-interleave approach to setting 6839 up the the initial vector does too. */ 6840 if (!multiple_p (nunits_out, group_size)) 6841 { 6842 if (dump_enabled_p ()) 6843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6844 "unsupported form of SLP reduction for" 6845 " variable-length vectors: the vector size" 6846 " is not a multiple of the number of results.\n"); 6847 return false; 6848 } 6849 } 6850 6851 /* In case of widenning multiplication by a constant, we update the type 6852 of the constant to be the type of the other operand. We check that the 6853 constant fits the type in the pattern recognition pass. */ 6854 if (code == DOT_PROD_EXPR 6855 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1]))) 6856 { 6857 if (TREE_CODE (ops[0]) == INTEGER_CST) 6858 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]); 6859 else if (TREE_CODE (ops[1]) == INTEGER_CST) 6860 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]); 6861 else 6862 { 6863 if (dump_enabled_p ()) 6864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6865 "invalid types in dot-prod\n"); 6866 6867 return false; 6868 } 6869 } 6870 6871 if (reduction_type == COND_REDUCTION) 6872 { 6873 widest_int ni; 6874 6875 if (! max_loop_iterations (loop, &ni)) 6876 { 6877 if (dump_enabled_p ()) 6878 dump_printf_loc (MSG_NOTE, vect_location, 6879 "loop count not known, cannot create cond " 6880 "reduction.\n"); 6881 return false; 6882 } 6883 /* Convert backedges to iterations. */ 6884 ni += 1; 6885 6886 /* The additional index will be the same type as the condition. Check 6887 that the loop can fit into this less one (because we'll use up the 6888 zero slot for when there are no matches). 
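     As a worked example (illustrative), if the condition type is an 8-bit
     unsigned type then MAX_INDEX is 255; since index 0 is reserved for the
     no-match case, the check below only accepts loops whose iteration count
     (number of backedges plus one) is less than 255.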
*/ 6889 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); 6890 if (wi::geu_p (ni, wi::to_widest (max_index))) 6891 { 6892 if (dump_enabled_p ()) 6893 dump_printf_loc (MSG_NOTE, vect_location, 6894 "loop size is greater than data size.\n"); 6895 return false; 6896 } 6897 } 6898 6899 /* In case the vectorization factor (VF) is bigger than the number 6900 of elements that we can fit in a vectype (nunits), we have to generate 6901 more than one vector stmt - i.e - we need to "unroll" the 6902 vector stmt by a factor VF/nunits. For more details see documentation 6903 in vectorizable_operation. */ 6904 6905 /* If the reduction is used in an outer loop we need to generate 6906 VF intermediate results, like so (e.g. for ncopies=2): 6907 r0 = phi (init, r0) 6908 r1 = phi (init, r1) 6909 r0 = x0 + r0; 6910 r1 = x1 + r1; 6911 (i.e. we generate VF results in 2 registers). 6912 In this case we have a separate def-use cycle for each copy, and therefore 6913 for each copy we get the vector def for the reduction variable from the 6914 respective phi node created for this copy. 6915 6916 Otherwise (the reduction is unused in the loop nest), we can combine 6917 together intermediate results, like so (e.g. for ncopies=2): 6918 r = phi (init, r) 6919 r = x0 + r; 6920 r = x1 + r; 6921 (i.e. we generate VF/2 results in a single register). 6922 In this case for each copy we get the vector def for the reduction variable 6923 from the vectorized reduction operation generated in the previous iteration. 6924 6925 This only works when we see both the reduction PHI and its only consumer 6926 in vectorizable_reduction and there are no intermediate stmts 6927 participating. */ 6928 stmt_vec_info use_stmt_info; 6929 tree reduc_phi_result = gimple_phi_result (reduc_def_phi); 6930 if (ncopies > 1 6931 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) 6932 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result)) 6933 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info) 6934 { 6935 single_defuse_cycle = true; 6936 epilog_copies = 1; 6937 } 6938 else 6939 epilog_copies = ncopies; 6940 6941 /* If the reduction stmt is one of the patterns that have lane 6942 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ 6943 if ((ncopies > 1 6944 && ! single_defuse_cycle) 6945 && (code == DOT_PROD_EXPR 6946 || code == WIDEN_SUM_EXPR 6947 || code == SAD_EXPR)) 6948 { 6949 if (dump_enabled_p ()) 6950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6951 "multi def-use cycle not possible for lane-reducing " 6952 "reduction operation\n"); 6953 return false; 6954 } 6955 6956 if (slp_node) 6957 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 6958 else 6959 vec_num = 1; 6960 6961 internal_fn cond_fn = get_conditional_internal_fn (code); 6962 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 6963 6964 if (!vec_stmt) /* transformation not required. 
*/ 6965 { 6966 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec); 6967 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 6968 { 6969 if (reduction_type != FOLD_LEFT_REDUCTION 6970 && (cond_fn == IFN_LAST 6971 || !direct_internal_fn_supported_p (cond_fn, vectype_in, 6972 OPTIMIZE_FOR_SPEED))) 6973 { 6974 if (dump_enabled_p ()) 6975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6976 "can't use a fully-masked loop because no" 6977 " conditional operation is available.\n"); 6978 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 6979 } 6980 else if (reduc_index == -1) 6981 { 6982 if (dump_enabled_p ()) 6983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6984 "can't use a fully-masked loop for chained" 6985 " reductions.\n"); 6986 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 6987 } 6988 else 6989 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, 6990 vectype_in); 6991 } 6992 if (dump_enabled_p () 6993 && reduction_type == FOLD_LEFT_REDUCTION) 6994 dump_printf_loc (MSG_NOTE, vect_location, 6995 "using an in-order (fold-left) reduction.\n"); 6996 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6997 return true; 6998 } 6999 7000 /* Transform. */ 7001 7002 if (dump_enabled_p ()) 7003 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); 7004 7005 /* FORNOW: Multiple types are not supported for condition. */ 7006 if (code == COND_EXPR) 7007 gcc_assert (ncopies == 1); 7008 7009 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 7010 7011 if (reduction_type == FOLD_LEFT_REDUCTION) 7012 return vectorize_fold_left_reduction 7013 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, 7014 reduc_fn, ops, vectype_in, reduc_index, masks); 7015 7016 if (reduction_type == EXTRACT_LAST_REDUCTION) 7017 { 7018 gcc_assert (!slp_node); 7019 return vectorizable_condition (stmt_info, gsi, vec_stmt, 7020 true, NULL, NULL); 7021 } 7022 7023 /* Create the destination vector */ 7024 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 7025 7026 prev_stmt_info = NULL; 7027 prev_phi_info = NULL; 7028 if (!slp_node) 7029 { 7030 vec_oprnds0.create (1); 7031 vec_oprnds1.create (1); 7032 if (op_type == ternary_op) 7033 vec_oprnds2.create (1); 7034 } 7035 7036 phis.create (vec_num); 7037 vect_defs.create (vec_num); 7038 if (!slp_node) 7039 vect_defs.quick_push (NULL_TREE); 7040 7041 if (slp_node) 7042 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis)); 7043 else 7044 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info)); 7045 7046 for (j = 0; j < ncopies; j++) 7047 { 7048 if (code == COND_EXPR) 7049 { 7050 gcc_assert (!slp_node); 7051 vectorizable_condition (stmt_info, gsi, vec_stmt, 7052 true, NULL, NULL); 7053 break; 7054 } 7055 if (code == LSHIFT_EXPR 7056 || code == RSHIFT_EXPR) 7057 { 7058 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL); 7059 break; 7060 } 7061 7062 /* Handle uses. */ 7063 if (j == 0) 7064 { 7065 if (slp_node) 7066 { 7067 /* Get vec defs for all the operands except the reduction index, 7068 ensuring the ordering of the ops in the vector is kept. 
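     In other words (illustrative), VEC_DEFS gets one entry per operand, and
     each entry holds that operand's vector defs in SLP order, so after the
     splices below VEC_OPRNDS0[i], VEC_OPRNDS1[i] (and VEC_OPRNDS2[i] for a
     ternary op) all belong to the same vector statement i.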
*/ 7069 auto_vec<tree, 3> slp_ops; 7070 auto_vec<vec<tree>, 3> vec_defs; 7071 7072 slp_ops.quick_push (ops[0]); 7073 slp_ops.quick_push (ops[1]); 7074 if (op_type == ternary_op) 7075 slp_ops.quick_push (ops[2]); 7076 7077 vect_get_slp_defs (slp_ops, slp_node, &vec_defs); 7078 7079 vec_oprnds0.safe_splice (vec_defs[0]); 7080 vec_defs[0].release (); 7081 vec_oprnds1.safe_splice (vec_defs[1]); 7082 vec_defs[1].release (); 7083 if (op_type == ternary_op) 7084 { 7085 vec_oprnds2.safe_splice (vec_defs[2]); 7086 vec_defs[2].release (); 7087 } 7088 } 7089 else 7090 { 7091 vec_oprnds0.quick_push 7092 (vect_get_vec_def_for_operand (ops[0], stmt_info)); 7093 vec_oprnds1.quick_push 7094 (vect_get_vec_def_for_operand (ops[1], stmt_info)); 7095 if (op_type == ternary_op) 7096 vec_oprnds2.quick_push 7097 (vect_get_vec_def_for_operand (ops[2], stmt_info)); 7098 } 7099 } 7100 else 7101 { 7102 if (!slp_node) 7103 { 7104 gcc_assert (reduc_index != -1 || ! single_defuse_cycle); 7105 7106 if (single_defuse_cycle && reduc_index == 0) 7107 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt); 7108 else 7109 vec_oprnds0[0] 7110 = vect_get_vec_def_for_stmt_copy (loop_vinfo, 7111 vec_oprnds0[0]); 7112 if (single_defuse_cycle && reduc_index == 1) 7113 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt); 7114 else 7115 vec_oprnds1[0] 7116 = vect_get_vec_def_for_stmt_copy (loop_vinfo, 7117 vec_oprnds1[0]); 7118 if (op_type == ternary_op) 7119 { 7120 if (single_defuse_cycle && reduc_index == 2) 7121 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt); 7122 else 7123 vec_oprnds2[0] 7124 = vect_get_vec_def_for_stmt_copy (loop_vinfo, 7125 vec_oprnds2[0]); 7126 } 7127 } 7128 } 7129 7130 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 7131 { 7132 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; 7133 if (masked_loop_p) 7134 { 7135 /* Make sure that the reduction accumulator is vop[0]. */ 7136 if (reduc_index == 1) 7137 { 7138 gcc_assert (commutative_tree_code (code)); 7139 std::swap (vop[0], vop[1]); 7140 } 7141 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, 7142 vectype_in, i * ncopies + j); 7143 gcall *call = gimple_build_call_internal (cond_fn, 4, mask, 7144 vop[0], vop[1], 7145 vop[0]); 7146 new_temp = make_ssa_name (vec_dest, call); 7147 gimple_call_set_lhs (call, new_temp); 7148 gimple_call_set_nothrow (call, true); 7149 new_stmt_info 7150 = vect_finish_stmt_generation (stmt_info, call, gsi); 7151 } 7152 else 7153 { 7154 if (op_type == ternary_op) 7155 vop[2] = vec_oprnds2[i]; 7156 7157 gassign *new_stmt = gimple_build_assign (vec_dest, code, 7158 vop[0], vop[1], vop[2]); 7159 new_temp = make_ssa_name (vec_dest, new_stmt); 7160 gimple_assign_set_lhs (new_stmt, new_temp); 7161 new_stmt_info 7162 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); 7163 } 7164 7165 if (slp_node) 7166 { 7167 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); 7168 vect_defs.quick_push (new_temp); 7169 } 7170 else 7171 vect_defs[0] = new_temp; 7172 } 7173 7174 if (slp_node) 7175 continue; 7176 7177 if (j == 0) 7178 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; 7179 else 7180 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; 7181 7182 prev_stmt_info = new_stmt_info; 7183 } 7184 7185 /* Finalize the reduction-phi (set its arguments) and create the 7186 epilog reduction code. 
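     As an illustrative sketch of what that epilogue does for a simple sum
     (assuming a constant number of lanes):

       vacc = { a0, a1, a2, a3 }      vector accumulator on loop exit
       sum  = a0 + a1 + a2 + a3       scalar result

     computed with a single REDUC_PLUS-style internal function call when
     REDUC_FN is available, and otherwise by successively halving the vector
     with shifts or by extracting and adding the elements one by one.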
*/ 7187 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) 7188 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt); 7189 7190 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi, 7191 epilog_copies, reduc_fn, phis, 7192 double_reduc, slp_node, slp_node_instance, 7193 cond_reduc_val, cond_reduc_op_code, 7194 neutral_op); 7195 7196 return true; 7197 } 7198 7199 /* Function vect_min_worthwhile_factor. 7200 7201 For a loop where we could vectorize the operation indicated by CODE, 7202 return the minimum vectorization factor that makes it worthwhile 7203 to use generic vectors. */ 7204 static unsigned int 7205 vect_min_worthwhile_factor (enum tree_code code) 7206 { 7207 switch (code) 7208 { 7209 case PLUS_EXPR: 7210 case MINUS_EXPR: 7211 case NEGATE_EXPR: 7212 return 4; 7213 7214 case BIT_AND_EXPR: 7215 case BIT_IOR_EXPR: 7216 case BIT_XOR_EXPR: 7217 case BIT_NOT_EXPR: 7218 return 2; 7219 7220 default: 7221 return INT_MAX; 7222 } 7223 } 7224 7225 /* Return true if VINFO indicates we are doing loop vectorization and if 7226 it is worth decomposing CODE operations into scalar operations for 7227 that loop's vectorization factor. */ 7228 7229 bool 7230 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code) 7231 { 7232 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); 7233 unsigned HOST_WIDE_INT value; 7234 return (loop_vinfo 7235 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value) 7236 && value >= vect_min_worthwhile_factor (code)); 7237 } 7238 7239 /* Function vectorizable_induction 7240 7241 Check if STMT_INFO performs an induction computation that can be vectorized. 7242 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized 7243 phi to replace it, put it in VEC_STMT, and add it to the same basic block. 7244 Return true if STMT_INFO is vectorizable in this way. */ 7245 7246 bool 7247 vectorizable_induction (stmt_vec_info stmt_info, 7248 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 7249 stmt_vec_info *vec_stmt, slp_tree slp_node, 7250 stmt_vector_for_cost *cost_vec) 7251 { 7252 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 7253 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7254 unsigned ncopies; 7255 bool nested_in_vect_loop = false; 7256 struct loop *iv_loop; 7257 tree vec_def; 7258 edge pe = loop_preheader_edge (loop); 7259 basic_block new_bb; 7260 tree new_vec, vec_init, vec_step, t; 7261 tree new_name; 7262 gimple *new_stmt; 7263 gphi *induction_phi; 7264 tree induc_def, vec_dest; 7265 tree init_expr, step_expr; 7266 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 7267 unsigned i; 7268 tree expr; 7269 gimple_seq stmts; 7270 imm_use_iterator imm_iter; 7271 use_operand_p use_p; 7272 gimple *exit_phi; 7273 edge latch_e; 7274 tree loop_arg; 7275 gimple_stmt_iterator si; 7276 7277 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); 7278 if (!phi) 7279 return false; 7280 7281 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 7282 return false; 7283 7284 /* Make sure it was recognized as induction computation. */ 7285 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 7286 return false; 7287 7288 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7289 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 7290 7291 if (slp_node) 7292 ncopies = 1; 7293 else 7294 ncopies = vect_get_num_copies (loop_vinfo, vectype); 7295 gcc_assert (ncopies >= 1); 7296 7297 /* FORNOW. These restrictions should be relaxed. 
*/ 7298 if (nested_in_vect_loop_p (loop, stmt_info)) 7299 { 7300 imm_use_iterator imm_iter; 7301 use_operand_p use_p; 7302 gimple *exit_phi; 7303 edge latch_e; 7304 tree loop_arg; 7305 7306 if (ncopies > 1) 7307 { 7308 if (dump_enabled_p ()) 7309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7310 "multiple types in nested loop.\n"); 7311 return false; 7312 } 7313 7314 /* FORNOW: outer loop induction with SLP not supported. */ 7315 if (STMT_SLP_TYPE (stmt_info)) 7316 return false; 7317 7318 exit_phi = NULL; 7319 latch_e = loop_latch_edge (loop->inner); 7320 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7321 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 7322 { 7323 gimple *use_stmt = USE_STMT (use_p); 7324 if (is_gimple_debug (use_stmt)) 7325 continue; 7326 7327 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt))) 7328 { 7329 exit_phi = use_stmt; 7330 break; 7331 } 7332 } 7333 if (exit_phi) 7334 { 7335 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi); 7336 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 7337 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) 7338 { 7339 if (dump_enabled_p ()) 7340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7341 "inner-loop induction only used outside " 7342 "of the outer vectorized loop.\n"); 7343 return false; 7344 } 7345 } 7346 7347 nested_in_vect_loop = true; 7348 iv_loop = loop->inner; 7349 } 7350 else 7351 iv_loop = loop; 7352 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); 7353 7354 if (slp_node && !nunits.is_constant ()) 7355 { 7356 /* The current SLP code creates the initial value element-by-element. */ 7357 if (dump_enabled_p ()) 7358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7359 "SLP induction not supported for variable-length" 7360 " vectors.\n"); 7361 return false; 7362 } 7363 7364 if (!vec_stmt) /* transformation not required. */ 7365 { 7366 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; 7367 DUMP_VECT_SCOPE ("vectorizable_induction"); 7368 vect_model_induction_cost (stmt_info, ncopies, cost_vec); 7369 return true; 7370 } 7371 7372 /* Transform. */ 7373 7374 /* Compute a vector variable, initialized with the first VF values of 7375 the induction variable. E.g., for an iv with IV_PHI='X' and 7376 evolution S, for a vector of 4 units, we want to compute: 7377 [X, X + S, X + 2*S, X + 3*S]. */ 7378 7379 if (dump_enabled_p ()) 7380 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); 7381 7382 latch_e = loop_latch_edge (iv_loop); 7383 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7384 7385 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); 7386 gcc_assert (step_expr != NULL_TREE); 7387 7388 pe = loop_preheader_edge (iv_loop); 7389 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, 7390 loop_preheader_edge (iv_loop)); 7391 7392 stmts = NULL; 7393 if (!nested_in_vect_loop) 7394 { 7395 /* Convert the initial value to the desired type. */ 7396 tree new_type = TREE_TYPE (vectype); 7397 init_expr = gimple_convert (&stmts, new_type, init_expr); 7398 7399 /* If we are using the loop mask to "peel" for alignment then we need 7400 to adjust the start value here. 
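     For example (illustrative), if the mask skips the first three scalar
     iterations (SKIP_NITERS = 3) and the induction variable starts at X
     with step S, the adjusted start value is X - 3*S, so that lane 3 of the
     first vector iteration (the first unmasked lane) still sees X.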
*/ 7401 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); 7402 if (skip_niters != NULL_TREE) 7403 { 7404 if (FLOAT_TYPE_P (vectype)) 7405 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, 7406 skip_niters); 7407 else 7408 skip_niters = gimple_convert (&stmts, new_type, skip_niters); 7409 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, 7410 skip_niters, step_expr); 7411 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, 7412 init_expr, skip_step); 7413 } 7414 } 7415 7416 /* Convert the step to the desired type. */ 7417 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr); 7418 7419 if (stmts) 7420 { 7421 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7422 gcc_assert (!new_bb); 7423 } 7424 7425 /* Find the first insertion point in the BB. */ 7426 basic_block bb = gimple_bb (phi); 7427 si = gsi_after_labels (bb); 7428 7429 /* For SLP induction we have to generate several IVs as for example 7430 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S] 7431 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform 7432 [VF*S, VF*S, VF*S, VF*S] for all. */ 7433 if (slp_node) 7434 { 7435 /* Enforced above. */ 7436 unsigned int const_nunits = nunits.to_constant (); 7437 7438 /* Generate [VF*S, VF*S, ... ]. */ 7439 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7440 { 7441 expr = build_int_cst (integer_type_node, vf); 7442 expr = fold_convert (TREE_TYPE (step_expr), expr); 7443 } 7444 else 7445 expr = build_int_cst (TREE_TYPE (step_expr), vf); 7446 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7447 expr, step_expr); 7448 if (! CONSTANT_CLASS_P (new_name)) 7449 new_name = vect_init_vector (stmt_info, new_name, 7450 TREE_TYPE (step_expr), NULL); 7451 new_vec = build_vector_from_val (vectype, new_name); 7452 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7453 7454 /* Now generate the IVs. */ 7455 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7456 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7457 unsigned elts = const_nunits * nvects; 7458 unsigned nivs = least_common_multiple (group_size, 7459 const_nunits) / const_nunits; 7460 gcc_assert (elts % group_size == 0); 7461 tree elt = init_expr; 7462 unsigned ivn; 7463 for (ivn = 0; ivn < nivs; ++ivn) 7464 { 7465 tree_vector_builder elts (vectype, const_nunits, 1); 7466 stmts = NULL; 7467 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) 7468 { 7469 if (ivn*const_nunits + eltn >= group_size 7470 && (ivn * const_nunits + eltn) % group_size == 0) 7471 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), 7472 elt, step_expr); 7473 elts.quick_push (elt); 7474 } 7475 vec_init = gimple_build_vector (&stmts, &elts); 7476 if (stmts) 7477 { 7478 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7479 gcc_assert (!new_bb); 7480 } 7481 7482 /* Create the induction-phi that defines the induction-operand. 
*/ 7483 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 7484 induction_phi = create_phi_node (vec_dest, iv_loop->header); 7485 stmt_vec_info induction_phi_info 7486 = loop_vinfo->add_stmt (induction_phi); 7487 induc_def = PHI_RESULT (induction_phi); 7488 7489 /* Create the iv update inside the loop */ 7490 vec_def = make_ssa_name (vec_dest); 7491 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 7492 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7493 loop_vinfo->add_stmt (new_stmt); 7494 7495 /* Set the arguments of the phi node: */ 7496 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7497 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7498 UNKNOWN_LOCATION); 7499 7500 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info); 7501 } 7502 7503 /* Re-use IVs when we can. */ 7504 if (ivn < nvects) 7505 { 7506 unsigned vfp 7507 = least_common_multiple (group_size, const_nunits) / group_size; 7508 /* Generate [VF'*S, VF'*S, ... ]. */ 7509 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7510 { 7511 expr = build_int_cst (integer_type_node, vfp); 7512 expr = fold_convert (TREE_TYPE (step_expr), expr); 7513 } 7514 else 7515 expr = build_int_cst (TREE_TYPE (step_expr), vfp); 7516 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7517 expr, step_expr); 7518 if (! CONSTANT_CLASS_P (new_name)) 7519 new_name = vect_init_vector (stmt_info, new_name, 7520 TREE_TYPE (step_expr), NULL); 7521 new_vec = build_vector_from_val (vectype, new_name); 7522 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7523 for (; ivn < nvects; ++ivn) 7524 { 7525 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt; 7526 tree def; 7527 if (gimple_code (iv) == GIMPLE_PHI) 7528 def = gimple_phi_result (iv); 7529 else 7530 def = gimple_assign_lhs (iv); 7531 new_stmt = gimple_build_assign (make_ssa_name (vectype), 7532 PLUS_EXPR, 7533 def, vec_step); 7534 if (gimple_code (iv) == GIMPLE_PHI) 7535 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7536 else 7537 { 7538 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); 7539 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING); 7540 } 7541 SLP_TREE_VEC_STMTS (slp_node).quick_push 7542 (loop_vinfo->add_stmt (new_stmt)); 7543 } 7544 } 7545 7546 return true; 7547 } 7548 7549 /* Create the vector that holds the initial_value of the induction. */ 7550 if (nested_in_vect_loop) 7551 { 7552 /* iv_loop is nested in the loop to be vectorized. init_expr had already 7553 been created during vectorization of previous stmts. We obtain it 7554 from the STMT_VINFO_VEC_STMT of the defining stmt. */ 7555 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info); 7556 /* If the initial value is not of proper type, convert it. */ 7557 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) 7558 { 7559 new_stmt 7560 = gimple_build_assign (vect_get_new_ssa_name (vectype, 7561 vect_simple_var, 7562 "vec_iv_"), 7563 VIEW_CONVERT_EXPR, 7564 build1 (VIEW_CONVERT_EXPR, vectype, 7565 vec_init)); 7566 vec_init = gimple_assign_lhs (new_stmt); 7567 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), 7568 new_stmt); 7569 gcc_assert (!new_bb); 7570 loop_vinfo->add_stmt (new_stmt); 7571 } 7572 } 7573 else 7574 { 7575 /* iv_loop is the loop to be vectorized. 
Create: 7576 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 7577 stmts = NULL; 7578 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); 7579 7580 unsigned HOST_WIDE_INT const_nunits; 7581 if (nunits.is_constant (&const_nunits)) 7582 { 7583 tree_vector_builder elts (vectype, const_nunits, 1); 7584 elts.quick_push (new_name); 7585 for (i = 1; i < const_nunits; i++) 7586 { 7587 /* Create: new_name_i = new_name + step_expr */ 7588 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), 7589 new_name, step_expr); 7590 elts.quick_push (new_name); 7591 } 7592 /* Create a vector from [new_name_0, new_name_1, ..., 7593 new_name_nunits-1] */ 7594 vec_init = gimple_build_vector (&stmts, &elts); 7595 } 7596 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) 7597 /* Build the initial value directly from a VEC_SERIES_EXPR. */ 7598 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype, 7599 new_name, step_expr); 7600 else 7601 { 7602 /* Build: 7603 [base, base, base, ...] 7604 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ 7605 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); 7606 gcc_assert (flag_associative_math); 7607 tree index = build_index_vector (vectype, 0, 1); 7608 tree base_vec = gimple_build_vector_from_val (&stmts, vectype, 7609 new_name); 7610 tree step_vec = gimple_build_vector_from_val (&stmts, vectype, 7611 step_expr); 7612 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index); 7613 vec_init = gimple_build (&stmts, MULT_EXPR, vectype, 7614 vec_init, step_vec); 7615 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype, 7616 vec_init, base_vec); 7617 } 7618 7619 if (stmts) 7620 { 7621 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7622 gcc_assert (!new_bb); 7623 } 7624 } 7625 7626 7627 /* Create the vector that holds the step of the induction. */ 7628 if (nested_in_vect_loop) 7629 /* iv_loop is nested in the loop to be vectorized. Generate: 7630 vec_step = [S, S, S, S] */ 7631 new_name = step_expr; 7632 else 7633 { 7634 /* iv_loop is the loop to be vectorized. Generate: 7635 vec_step = [VF*S, VF*S, VF*S, VF*S] */ 7636 gimple_seq seq = NULL; 7637 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7638 { 7639 expr = build_int_cst (integer_type_node, vf); 7640 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 7641 } 7642 else 7643 expr = build_int_cst (TREE_TYPE (step_expr), vf); 7644 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 7645 expr, step_expr); 7646 if (seq) 7647 { 7648 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 7649 gcc_assert (!new_bb); 7650 } 7651 } 7652 7653 t = unshare_expr (new_name); 7654 gcc_assert (CONSTANT_CLASS_P (new_name) 7655 || TREE_CODE (new_name) == SSA_NAME); 7656 new_vec = build_vector_from_val (vectype, t); 7657 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7658 7659 7660 /* Create the following def-use cycle: 7661 loop prolog: 7662 vec_init = ... 7663 vec_step = ... 7664 loop: 7665 vec_iv = PHI <vec_init, vec_loop> 7666 ... 7667 STMT 7668 ... 7669 vec_loop = vec_iv + vec_step; */ 7670 7671 /* Create the induction-phi that defines the induction-operand. 
*/ 7672 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 7673 induction_phi = create_phi_node (vec_dest, iv_loop->header); 7674 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi); 7675 induc_def = PHI_RESULT (induction_phi); 7676 7677 /* Create the iv update inside the loop */ 7678 vec_def = make_ssa_name (vec_dest); 7679 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 7680 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7681 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt); 7682 7683 /* Set the arguments of the phi node: */ 7684 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7685 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7686 UNKNOWN_LOCATION); 7687 7688 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info; 7689 7690 /* In case that vectorization factor (VF) is bigger than the number 7691 of elements that we can fit in a vectype (nunits), we have to generate 7692 more than one vector stmt - i.e - we need to "unroll" the 7693 vector stmt by a factor VF/nunits. For more details see documentation 7694 in vectorizable_operation. */ 7695 7696 if (ncopies > 1) 7697 { 7698 gimple_seq seq = NULL; 7699 stmt_vec_info prev_stmt_vinfo; 7700 /* FORNOW. This restriction should be relaxed. */ 7701 gcc_assert (!nested_in_vect_loop); 7702 7703 /* Create the vector that holds the step of the induction. */ 7704 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7705 { 7706 expr = build_int_cst (integer_type_node, nunits); 7707 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 7708 } 7709 else 7710 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 7711 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 7712 expr, step_expr); 7713 if (seq) 7714 { 7715 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 7716 gcc_assert (!new_bb); 7717 } 7718 7719 t = unshare_expr (new_name); 7720 gcc_assert (CONSTANT_CLASS_P (new_name) 7721 || TREE_CODE (new_name) == SSA_NAME); 7722 new_vec = build_vector_from_val (vectype, t); 7723 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL); 7724 7725 vec_def = induc_def; 7726 prev_stmt_vinfo = induction_phi_info; 7727 for (i = 1; i < ncopies; i++) 7728 { 7729 /* vec_i = vec_prev + vec_step */ 7730 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, 7731 vec_def, vec_step); 7732 vec_def = make_ssa_name (vec_dest, new_stmt); 7733 gimple_assign_set_lhs (new_stmt, vec_def); 7734 7735 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7736 new_stmt_info = loop_vinfo->add_stmt (new_stmt); 7737 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info; 7738 prev_stmt_vinfo = new_stmt_info; 7739 } 7740 } 7741 7742 if (nested_in_vect_loop) 7743 { 7744 /* Find the loop-closed exit-phi of the induction, and record 7745 the final vector of induction results: */ 7746 exit_phi = NULL; 7747 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 7748 { 7749 gimple *use_stmt = USE_STMT (use_p); 7750 if (is_gimple_debug (use_stmt)) 7751 continue; 7752 7753 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))) 7754 { 7755 exit_phi = use_stmt; 7756 break; 7757 } 7758 } 7759 if (exit_phi) 7760 { 7761 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi); 7762 /* FORNOW. Currently not supporting the case that an inner-loop induction 7763 is not used in the outer-loop (i.e. only outside the outer-loop). 
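     An illustrative example of the unsupported case:

       for (i = 0; i < n; i++)        <-- outer loop being vectorized
         for (j = 0; j < m; j++)
           k = k + 1;                 <-- inner-loop induction
       ... = k;                       <-- only use is after the outer loop

     Such loops are rejected at analysis time (see the check near the top of
     this function), so here we only assert that the exit PHI is relevant
     within the outer loop rather than merely live outside it.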
*/ 7764 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 7765 && !STMT_VINFO_LIVE_P (stmt_vinfo)); 7766 7767 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info; 7768 if (dump_enabled_p ()) 7769 dump_printf_loc (MSG_NOTE, vect_location, 7770 "vector of inductions after inner-loop:%G", 7771 new_stmt); 7772 } 7773 } 7774 7775 7776 if (dump_enabled_p ()) 7777 dump_printf_loc (MSG_NOTE, vect_location, 7778 "transform induction: created def-use cycle: %G%G", 7779 induction_phi, SSA_NAME_DEF_STMT (vec_def)); 7780 7781 return true; 7782 } 7783 7784 /* Function vectorizable_live_operation. 7785 7786 STMT_INFO computes a value that is used outside the loop. Check if 7787 it can be supported. */ 7788 7789 bool 7790 vectorizable_live_operation (stmt_vec_info stmt_info, 7791 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 7792 slp_tree slp_node, int slp_index, 7793 stmt_vec_info *vec_stmt, 7794 stmt_vector_for_cost *) 7795 { 7796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 7797 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7798 imm_use_iterator imm_iter; 7799 tree lhs, lhs_type, bitsize, vec_bitsize; 7800 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7801 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 7802 int ncopies; 7803 gimple *use_stmt; 7804 auto_vec<tree> vec_oprnds; 7805 int vec_entry = 0; 7806 poly_uint64 vec_index = 0; 7807 7808 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); 7809 7810 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) 7811 return false; 7812 7813 /* FORNOW. CHECKME. */ 7814 if (nested_in_vect_loop_p (loop, stmt_info)) 7815 return false; 7816 7817 /* If STMT is not relevant and it is a simple assignment and its inputs are 7818 invariant then it can remain in place, unvectorized. The original last 7819 scalar value that it computes will be used. */ 7820 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 7821 { 7822 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo)); 7823 if (dump_enabled_p ()) 7824 dump_printf_loc (MSG_NOTE, vect_location, 7825 "statement is simple and uses invariant. Leaving in " 7826 "place.\n"); 7827 return true; 7828 } 7829 7830 if (slp_node) 7831 ncopies = 1; 7832 else 7833 ncopies = vect_get_num_copies (loop_vinfo, vectype); 7834 7835 if (slp_node) 7836 { 7837 gcc_assert (slp_index >= 0); 7838 7839 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7840 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7841 7842 /* Get the last occurrence of the scalar index from the concatenation of 7843 all the slp vectors. Calculate which slp vector it is and the index 7844 within. */ 7845 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; 7846 7847 /* Calculate which vector contains the result, and which lane of 7848 that vector we need. */ 7849 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index)) 7850 { 7851 if (dump_enabled_p ()) 7852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7853 "Cannot determine which vector holds the" 7854 " final result.\n"); 7855 return false; 7856 } 7857 } 7858 7859 if (!vec_stmt) 7860 { 7861 /* No transformation required. 
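     We only record here whether a fully-masked loop remains possible; the
     extraction code itself is generated on the transform path further down.
     As an illustrative aside on the SLP lane selection above: with
     NUM_VEC = 2 vectors of NUNITS = 4 lanes, NUM_SCALAR = 6 and
     SLP_INDEX = 5, POS is 2*4 - 6 + 5 = 7, giving VEC_ENTRY = 1 and
     VEC_INDEX = 3, i.e. the last lane of the second vector.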
*/ 7862 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 7863 { 7864 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, 7865 OPTIMIZE_FOR_SPEED)) 7866 { 7867 if (dump_enabled_p ()) 7868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7869 "can't use a fully-masked loop because " 7870 "the target doesn't support extract last " 7871 "reduction.\n"); 7872 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 7873 } 7874 else if (slp_node) 7875 { 7876 if (dump_enabled_p ()) 7877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7878 "can't use a fully-masked loop because an " 7879 "SLP statement is live after the loop.\n"); 7880 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 7881 } 7882 else if (ncopies > 1) 7883 { 7884 if (dump_enabled_p ()) 7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7886 "can't use a fully-masked loop because" 7887 " ncopies is greater than 1.\n"); 7888 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 7889 } 7890 else 7891 { 7892 gcc_assert (ncopies == 1 && !slp_node); 7893 vect_record_loop_mask (loop_vinfo, 7894 &LOOP_VINFO_MASKS (loop_vinfo), 7895 1, vectype); 7896 } 7897 } 7898 return true; 7899 } 7900 7901 /* Use the lhs of the original scalar statement. */ 7902 gimple *stmt = vect_orig_stmt (stmt_info)->stmt; 7903 7904 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt) 7905 : gimple_get_lhs (stmt); 7906 lhs_type = TREE_TYPE (lhs); 7907 7908 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype) 7909 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype))) 7910 : TYPE_SIZE (TREE_TYPE (vectype))); 7911 vec_bitsize = TYPE_SIZE (vectype); 7912 7913 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */ 7914 tree vec_lhs, bitstart; 7915 if (slp_node) 7916 { 7917 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 7918 7919 /* Get the correct slp vectorized stmt. */ 7920 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt; 7921 if (gphi *phi = dyn_cast <gphi *> (vec_stmt)) 7922 vec_lhs = gimple_phi_result (phi); 7923 else 7924 vec_lhs = gimple_get_lhs (vec_stmt); 7925 7926 /* Get entry to use. */ 7927 bitstart = bitsize_int (vec_index); 7928 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart); 7929 } 7930 else 7931 { 7932 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info); 7933 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt); 7934 gcc_checking_assert (ncopies == 1 7935 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 7936 7937 /* For multiple copies, get the last copy. */ 7938 for (int i = 1; i < ncopies; ++i) 7939 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs); 7940 7941 /* Get the last lane in the vector. */ 7942 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize); 7943 } 7944 7945 gimple_seq stmts = NULL; 7946 tree new_tree; 7947 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 7948 { 7949 /* Emit: 7950 7951 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> 7952 7953 where VEC_LHS is the vectorized live-out result and MASK is 7954 the loop mask for the final iteration. */ 7955 gcc_assert (ncopies == 1 && !slp_node); 7956 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); 7957 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 7958 1, vectype, 0); 7959 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, 7960 scalar_type, mask, vec_lhs); 7961 7962 /* Convert the extracted vector element to the required scalar type. 
*/ 7963 new_tree = gimple_convert (&stmts, lhs_type, scalar_res); 7964 } 7965 else 7966 { 7967 tree bftype = TREE_TYPE (vectype); 7968 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 7969 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); 7970 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart); 7971 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), 7972 &stmts, true, NULL_TREE); 7973 } 7974 7975 if (stmts) 7976 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts); 7977 7978 /* Replace use of lhs with newly computed result. If the use stmt is a 7979 single arg PHI, just replace all uses of PHI result. It's necessary 7980 because lcssa PHI defining lhs may be before newly inserted stmt. */ 7981 use_operand_p use_p; 7982 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) 7983 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 7984 && !is_gimple_debug (use_stmt)) 7985 { 7986 if (gimple_code (use_stmt) == GIMPLE_PHI 7987 && gimple_phi_num_args (use_stmt) == 1) 7988 { 7989 replace_uses_by (gimple_phi_result (use_stmt), new_tree); 7990 } 7991 else 7992 { 7993 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 7994 SET_USE (use_p, new_tree); 7995 } 7996 update_stmt (use_stmt); 7997 } 7998 7999 return true; 8000 } 8001 8002 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */ 8003 8004 static void 8005 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info) 8006 { 8007 ssa_op_iter op_iter; 8008 imm_use_iterator imm_iter; 8009 def_operand_p def_p; 8010 gimple *ustmt; 8011 8012 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF) 8013 { 8014 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) 8015 { 8016 basic_block bb; 8017 8018 if (!is_gimple_debug (ustmt)) 8019 continue; 8020 8021 bb = gimple_bb (ustmt); 8022 8023 if (!flow_bb_inside_loop_p (loop, bb)) 8024 { 8025 if (gimple_debug_bind_p (ustmt)) 8026 { 8027 if (dump_enabled_p ()) 8028 dump_printf_loc (MSG_NOTE, vect_location, 8029 "killing debug use\n"); 8030 8031 gimple_debug_bind_reset_value (ustmt); 8032 update_stmt (ustmt); 8033 } 8034 else 8035 gcc_unreachable (); 8036 } 8037 } 8038 } 8039 } 8040 8041 /* Given loop represented by LOOP_VINFO, return true if computation of 8042 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false 8043 otherwise. */ 8044 8045 static bool 8046 loop_niters_no_overflow (loop_vec_info loop_vinfo) 8047 { 8048 /* Constant case. */ 8049 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 8050 { 8051 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo); 8052 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); 8053 8054 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST); 8055 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST); 8056 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) 8057 return true; 8058 } 8059 8060 widest_int max; 8061 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8062 /* Check the upper bound of loop niters. */ 8063 if (get_max_loop_iterations (loop, &max)) 8064 { 8065 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 8066 signop sgn = TYPE_SIGN (type); 8067 widest_int type_max = widest_int::from (wi::max_value (type), sgn); 8068 if (max < type_max) 8069 return true; 8070 } 8071 return false; 8072 } 8073 8074 /* Return a mask type with half the number of elements as TYPE. 
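     For example (illustrative), a 16-lane boolean vector type is mapped to
     an 8-lane one; the byte size (current_vector_size) stays the same, so
     each lane of the result controls twice as many bytes of data.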
*/ 8075 8076 tree 8077 vect_halve_mask_nunits (tree type) 8078 { 8079 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); 8080 return build_truth_vector_type (nunits, current_vector_size); 8081 } 8082 8083 /* Return a mask type with twice as many elements as TYPE. */ 8084 8085 tree 8086 vect_double_mask_nunits (tree type) 8087 { 8088 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; 8089 return build_truth_vector_type (nunits, current_vector_size); 8090 } 8091 8092 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to 8093 contain a sequence of NVECTORS masks that each control a vector of type 8094 VECTYPE. */ 8095 8096 void 8097 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, 8098 unsigned int nvectors, tree vectype) 8099 { 8100 gcc_assert (nvectors != 0); 8101 if (masks->length () < nvectors) 8102 masks->safe_grow_cleared (nvectors); 8103 rgroup_masks *rgm = &(*masks)[nvectors - 1]; 8104 /* The number of scalars per iteration and the number of vectors are 8105 both compile-time constants. */ 8106 unsigned int nscalars_per_iter 8107 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), 8108 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); 8109 if (rgm->max_nscalars_per_iter < nscalars_per_iter) 8110 { 8111 rgm->max_nscalars_per_iter = nscalars_per_iter; 8112 rgm->mask_type = build_same_sized_truth_vector_type (vectype); 8113 } 8114 } 8115 8116 /* Given a complete set of masks MASKS, extract mask number INDEX 8117 for an rgroup that operates on NVECTORS vectors of type VECTYPE, 8118 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI. 8119 8120 See the comment above vec_loop_masks for more details about the mask 8121 arrangement. */ 8122 8123 tree 8124 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, 8125 unsigned int nvectors, tree vectype, unsigned int index) 8126 { 8127 rgroup_masks *rgm = &(*masks)[nvectors - 1]; 8128 tree mask_type = rgm->mask_type; 8129 8130 /* Populate the rgroup's mask array, if this is the first time we've 8131 used it. */ 8132 if (rgm->masks.is_empty ()) 8133 { 8134 rgm->masks.safe_grow_cleared (nvectors); 8135 for (unsigned int i = 0; i < nvectors; ++i) 8136 { 8137 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); 8138 /* Provide a dummy definition until the real one is available. */ 8139 SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); 8140 rgm->masks[i] = mask; 8141 } 8142 } 8143 8144 tree mask = rgm->masks[index]; 8145 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), 8146 TYPE_VECTOR_SUBPARTS (vectype))) 8147 { 8148 /* A loop mask for data type X can be reused for data type Y 8149 if X has N times more elements than Y and if Y's elements 8150 are N times bigger than X's. In this case each sequence 8151 of N elements in the loop mask will be all-zero or all-one. 8152 We can then view-convert the mask so that each sequence of 8153 N elements is replaced by a single element. */ 8154 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), 8155 TYPE_VECTOR_SUBPARTS (vectype))); 8156 gimple_seq seq = NULL; 8157 mask_type = build_same_sized_truth_vector_type (vectype); 8158 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); 8159 if (seq) 8160 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); 8161 } 8162 return mask; 8163 } 8164 8165 /* Scale profiling counters by estimation for LOOP which is vectorized 8166 by factor VF. 
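     As an illustrative example, a loop whose profile predicts about 100
     iterations and which is vectorized by VF = 4 gets a NEW_EST_NITER of
     roughly 25; the body counts are then scaled so that the header runs
     about 25 + 1 times per preheader entry and the exit edge probability
     becomes 1/(25 + 1).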
*/ 8167 8168 static void 8169 scale_profile_for_vect_loop (struct loop *loop, unsigned vf) 8170 { 8171 edge preheader = loop_preheader_edge (loop); 8172 /* Reduce loop iterations by the vectorization factor. */ 8173 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); 8174 profile_count freq_h = loop->header->count, freq_e = preheader->count (); 8175 8176 if (freq_h.nonzero_p ()) 8177 { 8178 profile_probability p; 8179 8180 /* Avoid dropping loop body profile counter to 0 because of zero count 8181 in loop's preheader. */ 8182 if (!(freq_e == profile_count::zero ())) 8183 freq_e = freq_e.force_nonzero (); 8184 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h); 8185 scale_loop_frequencies (loop, p); 8186 } 8187 8188 edge exit_e = single_exit (loop); 8189 exit_e->probability = profile_probability::always () 8190 .apply_scale (1, new_est_niter + 1); 8191 8192 edge exit_l = single_pred_edge (loop->latch); 8193 profile_probability prob = exit_l->probability; 8194 exit_l->probability = exit_e->probability.invert (); 8195 if (prob.initialized_p () && exit_l->probability.initialized_p ()) 8196 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob); 8197 } 8198 8199 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI. 8200 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its 8201 stmt_vec_info. */ 8202 8203 static void 8204 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, 8205 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) 8206 { 8207 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8208 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8209 8210 if (dump_enabled_p ()) 8211 dump_printf_loc (MSG_NOTE, vect_location, 8212 "------>vectorizing statement: %G", stmt_info->stmt); 8213 8214 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 8215 vect_loop_kill_debug_uses (loop, stmt_info); 8216 8217 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8218 && !STMT_VINFO_LIVE_P (stmt_info)) 8219 return; 8220 8221 if (STMT_VINFO_VECTYPE (stmt_info)) 8222 { 8223 poly_uint64 nunits 8224 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 8225 if (!STMT_SLP_TYPE (stmt_info) 8226 && maybe_ne (nunits, vf) 8227 && dump_enabled_p ()) 8228 /* For SLP VF is set according to unrolling factor, and not 8229 to vector size, hence for SLP this print is not valid. */ 8230 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8231 } 8232 8233 /* Pure SLP statements have already been vectorized. We still need 8234 to apply loop vectorization to hybrid SLP statements. */ 8235 if (PURE_SLP_STMT (stmt_info)) 8236 return; 8237 8238 if (dump_enabled_p ()) 8239 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n"); 8240 8241 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL)) 8242 *seen_store = stmt_info; 8243 } 8244 8245 /* Function vect_transform_loop. 8246 8247 The analysis phase has determined that the loop is vectorizable. 8248 Vectorize the loop - created vectorized stmts to replace the scalar 8249 stmts in the loop, and update the loop exit condition. 8250 Returns scalar epilogue loop if any. 
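     The overall shape of the emitted code is roughly (illustrative):

       <runtime versioning and/or cost check, if required>
       <prologue peeling for alignment, if required>
       vectorized loop, stepping by VF (or fully masked)
       scalar epilogue loop for any remaining iterations

     and the scalar epilogue created by the peeling code is what gets
     returned.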
*/ 8251 8252 struct loop * 8253 vect_transform_loop (loop_vec_info loop_vinfo) 8254 { 8255 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8256 struct loop *epilogue = NULL; 8257 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 8258 int nbbs = loop->num_nodes; 8259 int i; 8260 tree niters_vector = NULL_TREE; 8261 tree step_vector = NULL_TREE; 8262 tree niters_vector_mult_vf = NULL_TREE; 8263 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8264 unsigned int lowest_vf = constant_lower_bound (vf); 8265 gimple *stmt; 8266 bool check_profitability = false; 8267 unsigned int th; 8268 8269 DUMP_VECT_SCOPE ("vec_transform_loop"); 8270 8271 loop_vinfo->shared->check_datarefs (); 8272 8273 /* Use the more conservative vectorization threshold. If the number 8274 of iterations is constant assume the cost check has been performed 8275 by our caller. If the threshold makes all loops profitable that 8276 run at least the (estimated) vectorization factor number of times 8277 checking is pointless, too. */ 8278 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 8279 if (th >= vect_vf_for_cost (loop_vinfo) 8280 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 8281 { 8282 if (dump_enabled_p ()) 8283 dump_printf_loc (MSG_NOTE, vect_location, 8284 "Profitability threshold is %d loop iterations.\n", 8285 th); 8286 check_profitability = true; 8287 } 8288 8289 /* Make sure there exists a single-predecessor exit bb. Do this before 8290 versioning. */ 8291 edge e = single_exit (loop); 8292 if (! single_pred_p (e->dest)) 8293 { 8294 split_loop_exit_edge (e, true); 8295 if (dump_enabled_p ()) 8296 dump_printf (MSG_NOTE, "split exit edge\n"); 8297 } 8298 8299 /* Version the loop first, if required, so the profitability check 8300 comes first. */ 8301 8302 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 8303 { 8304 poly_uint64 versioning_threshold 8305 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); 8306 if (check_profitability 8307 && ordered_p (poly_uint64 (th), versioning_threshold)) 8308 { 8309 versioning_threshold = ordered_max (poly_uint64 (th), 8310 versioning_threshold); 8311 check_profitability = false; 8312 } 8313 struct loop *sloop 8314 = vect_loop_versioning (loop_vinfo, th, check_profitability, 8315 versioning_threshold); 8316 sloop->force_vectorize = false; 8317 check_profitability = false; 8318 } 8319 8320 /* Make sure there exists a single-predecessor exit bb also on the 8321 scalar loop copy. Do this after versioning but before peeling 8322 so CFG structure is fine for both scalar and if-converted loop 8323 to make slpeel_duplicate_current_defs_from_edges face matched 8324 loop closed PHI nodes on the exit. */ 8325 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 8326 { 8327 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); 8328 if (! 
      if (! single_pred_p (e->dest))
        {
          split_loop_exit_edge (e, true);
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
        }
    }

  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                              &step_vector, &niters_vector_mult_vf, th,
                              check_profitability, niters_no_overflow);

  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
          && known_eq (lowest_vf, vf))
        {
          niters_vector
            = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                             LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
          step_vector = build_one_cst (TREE_TYPE (niters));
        }
      else
        vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
                                     &step_vector, niters_no_overflow);
    }

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo);
    }

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed needs
     to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "------>vectorizing phi: %G", phi);
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (!stmt_info)
            continue;

          if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, stmt_info);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if (STMT_VINFO_VECTYPE (stmt_info)
              && (maybe_ne
                  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
              && dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

          if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
              && ! PURE_SLP_STMT (stmt_info))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
              vect_transform_stmt (stmt_info, NULL, NULL, NULL);
            }
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si);)
        {
          stmt = gsi_stmt (si);
          /* During vectorization remove existing clobber stmts.
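             A clobber only marks the end of the lifetime of a variable
             and carries no value, so removing it is always safe and the
             vectorizer has no use for it.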
             */
          if (gimple_clobber_p (stmt))
            {
              unlink_stmt_vdef (stmt);
              gsi_remove (&si, true);
              release_defs (stmt);
            }
          else
            {
              stmt_info = loop_vinfo->lookup_stmt (stmt);

              /* Vector stmts created in the outer loop during vectorization
                 of stmts in an inner loop may not have a stmt_info and do
                 not need to be vectorized.  */
              stmt_vec_info seen_store = NULL;
              if (stmt_info)
                {
                  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
                    {
                      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
                           !gsi_end_p (subsi); gsi_next (&subsi))
                        {
                          stmt_vec_info pat_stmt_info
                            = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
                          vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
                                                    &si, &seen_store);
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
                      vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
                                                &seen_store);
                    }
                  vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
                                            &seen_store);
                }
              gsi_next (&si);
              if (seen_store)
                {
                  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
                    /* Interleaving.  The vectorization of the whole
                       interleaving chain has been completed - free all
                       the stores in the chain.  */
                    vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
                  else
                    /* Free the attached stmt_vec_info and remove the stmt.  */
                    loop_vinfo->remove_stmt (stmt_info);
                }
            }
        }

      /* Stub out scalar statements that must not survive vectorization.
         Doing this here helps with grouped statements, or statements that
         are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
           !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
          if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
            {
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree zero = build_zero_cst (TREE_TYPE (lhs));
                  gimple *new_stmt = gimple_build_assign (lhs, zero);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
        }
    }  /* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment
     of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
                           niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
     by the vector code.
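
     For example, with a vectorization factor of 4, a latch bound of 9
     (ten scalar iterations) and one guaranteed epilogue iteration, the
     vector loop covers at most nine scalar iterations, i.e. it runs at
     most (9 + 1 - 1) / 4 = 2 times and its own latch bound becomes 1.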
     */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
         In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    loop->nb_iterations_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
                          lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
                           lowest_vf) - 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
                          + bias_for_lowest, lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
                           + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
                          assumed_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
                           assumed_vf) - 1);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP VECTORIZED\n");
          if (loop->inner)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "OUTER LOOP VECTORIZED\n");
          dump_printf (MSG_NOTE, "\n");
        }
      else
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP EPILOGUE VECTORIZED (VS=");
          dump_dec (MSG_NOTE, current_vector_size);
          dump_printf (MSG_NOTE, ")\n");
        }
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
                         " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance, true);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is no longer valid after
     vectorization: the vectorized loop can have loop-carried
     dependences.  */
  loop->safelen = 0;

  /* Don't vectorize the epilogue of an epilogue loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    epilogue = NULL;

  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
    epilogue = NULL;

  if (epilogue)
    {
      auto_vector_sizes vector_sizes;
      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
      unsigned int next_size = 0;

      /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
         on niters already adjusted for the iterations of the prologue.
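
         The search below simply checks whether some supported vector size
         no wider than the current one is still usable for the epilogue
         (and, when the iteration count is known, whether enough scalar
         iterations remain for it); if not, no epilogue loop is returned
         for further vectorization.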
         */
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && known_eq (vf, lowest_vf))
        {
          unsigned HOST_WIDE_INT eiters
            = (LOOP_VINFO_INT_NITERS (loop_vinfo)
               - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
          eiters
            = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
          epilogue->nb_iterations_upper_bound = eiters - 1;
          epilogue->any_upper_bound = true;

          unsigned int ratio;
          while (next_size < vector_sizes.length ()
                 && !(constant_multiple_p (current_vector_size,
                                           vector_sizes[next_size], &ratio)
                      && eiters >= lowest_vf / ratio))
            next_size += 1;
        }
      else
        while (next_size < vector_sizes.length ()
               && maybe_lt (current_vector_size, vector_sizes[next_size]))
          next_size += 1;

      if (next_size == vector_sizes.length ())
        epilogue = NULL;
    }

  if (epilogue)
    {
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->safelen = loop->safelen;
      epilogue->dont_vectorize = false;

      /* We may need to if-convert the epilogue to vectorize it.  */
      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
        tree_if_conversion (epilogue);
    }

  return epilogue;
}

/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero, the store is not
   performed and, where possible, neither are the statements producing the
   stored values.  For example,

     for (i=0; i<n; i++)
       if (c[i])
         {
           p1[i] += 1;
           p2[i] = p3[i] + 2;
         }

   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/

void
optimize_mask_stores (struct loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  struct loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in the loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and an if-then structure in the CFG; then_bb belongs
         to the same loop as if_bb.  It can be different from LOOP when a
         two-level loop nest is vectorized and the mask_store belongs to
         the inner one.
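
         The resulting CFG is:

           BB:        ends with "if (mask == { 0, ... })";
                      the true edge goes to JOIN_BB, the false edge to STORE_BB
           STORE_BB:  receives the masked stores and, where possible, the
                      statements producing the stored values
           JOIN_BB:   the code that followed the masked store.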
         */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create a vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create a new PHI node for the vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM_3 = VDEF <.MEM_1>
         and a new PHI node will be created in the join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask into STORE_BB if
         possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move the masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Set GSI_TO to the start of the now non-empty block.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Move stmt to created bb\n%G", last);
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 a volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* The LHS of a vectorized stmt must be an SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;

              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove a dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                }

              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;

              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Move stmt to created bb\n%G", stmt1);
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask into STORE_BB.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}