/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   have successfully passed the analysis phase.

   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and
   pointer accesses are required to have a simple (consecutive) access
   pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.
   It inserts the vector code sequence just before the scalar stmt S, and
   records a pointer to the vector code in STMT_VINFO_VEC_STMT (stmt_info)
   (stmt_info is the stmt_vec_info struct attached to S).  This pointer
   will be used for the vectorization of following stmts which use the def
   of stmt S.  Stmt S is removed if it writes to memory; otherwise, we rely
   on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different sizes of vectors will, for now, need to specify one
   value for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the
   future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
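/* For instance, the support check for the V8HI addition in the example
   above could be sketched as follows (an illustration only; compare the
   actual checks of this form in vectorizable_operation in
   tree-vect-stmts.c, where VECTYPE would be the V8HI vector type):

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     bool supported
       = (op != NULL
          && optab_handler (op, TYPE_MODE (vectype))->insn_code
             != CODE_FOR_nothing);
*/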
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data
   elements that are operated upon in parallel in a single iteration of the
   vectorized loop.  For example, when vectorizing a loop that operates on
   4-byte elements, on a target with a vector size (VS) of 16 bytes, the VF
   is set to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                           &dummy);
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
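/* For instance, for the IV 'i' of "for (i=k; i<N; i+=4)", scev computes an
   access function of the form {k, +, 4}_1 (illustrative chrec notation; see
   tree-chrec.h): the evolution part is the constant 4, so the evolution is
   simple, with init 'k' and step 4.  An evolution part that is itself a
   chrec, i.e., a polynomial of degree >= 2, is rejected below.  */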
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
                                                           loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ", init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dumy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified; therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phis.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed
         elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                             &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables; classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples of scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
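/* Example3 (illustrative): a nested cycle classified below as a double
   reduction, where the sum is carried around both loops and the outer-loop
   is the one considered for vectorization (see the handling of
   vect_double_reduction_def in vect_analyze_scalar_cycles_1):

              loop3:
              for (i=0; i<N; i++)
                for (j=0; j<M; j++)
                   sum += a[i][j];  */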
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties
     than the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Function vect_get_loop_niters.

   Determine the number of iterations for which the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  return flow_bb_inside_loop_p (loop, bb);
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested
         inner-loop).  Therefore, for stmts in an inner-loop we just want to
         update the STMT_VINFO_LOOP_VINFO field of their stmt_info to point
         to the new loop_info of the outer-loop we are currently considering
         to vectorize (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from
         scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all
   the stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}


/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}


/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the
     future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */
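      /* In source form, such a vectorizable outer-loop would typically come
         from code like the following (an illustrative sketch, not lifted
         from the testsuite):

             for (i = 0; i < N; i++)        <-- outer-loop, the candidate
               {
                 s = 0;
                 for (j = 0; j < M; j++)    <-- single inner-loop
                   s += a[i][j];
                 b[i] = s;
               }  */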
      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
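  /* I.e., we expect the CFG equivalent of (an illustrative sketch):

         if (n > 0)                <-- if-guard
           {
             i = 0;
             do
               {
                 ... loop body ...
                 i++;
               }
             while (i < n);        <-- exit test at the end of the header
           }
  */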
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb: */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}


/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */
static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* Inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e., a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outer-loop (unless it is double
                 reduction, i.e., this phi is vect_double_reduction_def),
                 because this case would require us to actually do something
                 here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or are dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if it is worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */
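  /* For instance (illustrative numbers): with PARAM_MIN_VECT_LOOP_BOUND == 2
     and VF == 4, min_scalar_loop_bound is 2*4-1 = 7.  A cost-model estimate
     of min_profitable_iters == 10 then raises the threshold TH to 10,
     whereas an estimate of 5 leaves it at 7.  */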
  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}


/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.

     FORNOW: Handle only simple array references whose alignment can be
     forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed
     separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be
     vectorized.  */
  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze data dependences between the data-refs in the loop.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                 "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */
  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}


/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector), or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}


/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}


/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >
*/
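/* For instance, the classic sum reduction below matches pattern (1)
   (an illustrative mapping: a0 is the initial value of s, a1 the value of
   s at the start of an iteration, a3 the loaded a[i], and a2 the updated
   sum carried to the next iteration):

     s = 0;
     for (i=0; i<N; i++)
       s += a[i];
*/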
gimple
vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
                          bool check_reduction, bool *double_reduc)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  gimple def_stmt, def1 = NULL, def2 = NULL;
  enum tree_code code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;

  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization;
     otherwise, we assume outer-loop vectorization.  */
  gcc_assert ((check_reduction && loop == vect_loop)
              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));

  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: not ssa_name: ");
          print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
        }
      return NULL;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL;
    }

  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
      return NULL;
    }

  if (is_gimple_assign (def_stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }

  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
          || TREE_CODE (op1) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "unsupported phi node definition.");

          return NULL;
        }

      def1 = SSA_NAME_DEF_STMT (op1);
      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
          && loop->inner
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
          && is_gimple_assign (def1))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "detected double reduction: ");

          *double_reduc = true;
          return def_stmt;
        }

      return NULL;
    }

  code = gimple_assign_rhs_code (def_stmt);

  if (check_reduction
      && (!commutative_tree_code (code) || !associative_tree_code (code)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: not commutative/associative: ");
      return NULL;
    }

  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
    {
      if (code != COND_EXPR)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: not binary operation: ");

          return NULL;
        }

      op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
      if (COMPARISON_CLASS_P (op3))
        {
          op4 = TREE_OPERAND (op3, 1);
          op3 = TREE_OPERAND (op3, 0);
        }

      op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
      op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");

          return NULL;
        }
    }
  else
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");

          return NULL;
        }
    }

  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type, TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: multiple types: operation type: ");
          print_generic_expr (vect_dump, type, TDF_SLIM);
          fprintf (vect_dump, ", operands types: ");
          print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
          fprintf (vect_dump, ",");
          print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
          if (op3)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
            }

          if (op4)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
            }
        }

      return NULL;
    }

  /* Check that it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.
     One exception is when vectorizing an outer-loop: the inner-loop is
     executed sequentially, and therefore vectorizing reductions in the
     inner-loop during outer-loop vectorization is safe.  */

  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
      return NULL;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
           && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
      return NULL;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt,
                        "reduction: unsafe fixed-point math optimization: ");
      return NULL;
    }

  /* Reduction is safe.  We're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization
     3) nested cycle (i.e., outer loop vectorization).  */
  if (TREE_CODE (op1) == SSA_NAME)
    def1 = SSA_NAME_DEF_STMT (op1);

  if (TREE_CODE (op2) == SSA_NAME)
    def2 = SSA_NAME_DEF_STMT (op2);

  if (code != COND_EXPR
      && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: no defs for operands: ");
      return NULL;
    }

  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_internal_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 && def2 == phi
      && (code == COND_EXPR
          || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
              && (is_gimple_assign (def1)
                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                     == vect_induction_def
                  || (gimple_code (def1) == GIMPLE_PHI
                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                         == vect_internal_def
                      && !is_loop_header_bb_p (gimple_bb (def1)))))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "detected reduction: ");
      return def_stmt;
    }
  else if (def1 && def1 == phi
           && (code == COND_EXPR
               || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
                   && (is_gimple_assign (def2)
                       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
                          == vect_induction_def
                       || (gimple_code (def2) == GIMPLE_PHI
                           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
                              == vect_internal_def
                           && !is_loop_header_bb_p (gimple_bb (def2)))))))
    {
      if (check_reduction)
        {
          /* Swap operands (just for simplicity - so that the rest of the
             code can assume that the reduction variable is always the
             last (second) argument).  */
*/ 1865 if (vect_print_dump_info (REPORT_DETAILS)) 1866 report_vect_op (def_stmt, 1867 "detected reduction: need to swap operands: "); 1868 1869 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), 1870 gimple_assign_rhs2_ptr (def_stmt)); 1871 } 1872 else 1873 { 1874 if (vect_print_dump_info (REPORT_DETAILS)) 1875 report_vect_op (def_stmt, "detected reduction: "); 1876 } 1877 1878 return def_stmt; 1879 } 1880 else 1881 { 1882 if (vect_print_dump_info (REPORT_DETAILS)) 1883 report_vect_op (def_stmt, "reduction: unknown pattern: "); 1884 1885 return NULL; 1886 } 1887 } 1888 1889 1890 /* Function vect_estimate_min_profitable_iters 1891 1892 Return the number of iterations required for the vector version of the 1893 loop to be profitable relative to the cost of the scalar version of the 1894 loop. 1895 1896 TODO: Take profile info into account before making vectorization 1897 decisions, if available. */ 1898 1899 int 1900 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) 1901 { 1902 int i; 1903 int min_profitable_iters; 1904 int peel_iters_prologue; 1905 int peel_iters_epilogue; 1906 int vec_inside_cost = 0; 1907 int vec_outside_cost = 0; 1908 int scalar_single_iter_cost = 0; 1909 int scalar_outside_cost = 0; 1910 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1911 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1912 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1913 int nbbs = loop->num_nodes; 1914 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); 1915 int peel_guard_costs = 0; 1916 int innerloop_iters = 0, factor; 1917 VEC (slp_instance, heap) *slp_instances; 1918 slp_instance instance; 1919 1920 /* Cost model disabled. */ 1921 if (!flag_vect_cost_model) 1922 { 1923 if (vect_print_dump_info (REPORT_COST)) 1924 fprintf (vect_dump, "cost model disabled."); 1925 return 0; 1926 } 1927 1928 /* Requires loop versioning tests to handle misalignment. */ 1929 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 1930 { 1931 /* FIXME: Make cost depend on complexity of individual check. */ 1932 vec_outside_cost += 1933 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); 1934 if (vect_print_dump_info (REPORT_COST)) 1935 fprintf (vect_dump, "cost model: Adding cost of checks for loop " 1936 "versioning to treat misalignment.\n"); 1937 } 1938 1939 /* Requires loop versioning with alias checks. */ 1940 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 1941 { 1942 /* FIXME: Make cost depend on complexity of individual check. */ 1943 vec_outside_cost += 1944 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); 1945 if (vect_print_dump_info (REPORT_COST)) 1946 fprintf (vect_dump, "cost model: Adding cost of checks for loop " 1947 "versioning aliasing.\n"); 1948 } 1949 1950 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) 1951 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 1952 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST; 1953 1954 /* Count statements in scalar loop. Using this as scalar cost for a single 1955 iteration for now. 1956 1957 TODO: Add outer loop support. 1958 1959 TODO: Consider assigning different costs to different scalar 1960 statements. */ 1961 1962 /* FORNOW. 
*/ 1963 if (loop->inner) 1964 innerloop_iters = 50; /* FIXME */ 1965 1966 for (i = 0; i < nbbs; i++) 1967 { 1968 gimple_stmt_iterator si; 1969 basic_block bb = bbs[i]; 1970 1971 if (bb->loop_father == loop->inner) 1972 factor = innerloop_iters; 1973 else 1974 factor = 1; 1975 1976 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1977 { 1978 gimple stmt = gsi_stmt (si); 1979 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1980 /* Skip stmts that are not vectorized inside the loop. */ 1981 if (!STMT_VINFO_RELEVANT_P (stmt_info) 1982 && (!STMT_VINFO_LIVE_P (stmt_info) 1983 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) 1984 continue; 1985 scalar_single_iter_cost += cost_for_stmt (stmt) * factor; 1986 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; 1987 /* FIXME: for stmts in the inner-loop in outer-loop vectorization, 1988 some of the "outside" costs are generated inside the outer-loop. */ 1989 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); 1990 } 1991 } 1992 1993 /* Add additional cost for the peeled instructions in prologue and epilogue 1994 loop. 1995 1996 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 1997 at compile-time - we assume it's vf/2 (the worst would be vf-1). 1998 1999 TODO: Build an expression that represents peel_iters for prologue and 2000 epilogue to be used in a run-time test. */ 2001 2002 if (byte_misalign < 0) 2003 { 2004 peel_iters_prologue = vf/2; 2005 if (vect_print_dump_info (REPORT_COST)) 2006 fprintf (vect_dump, "cost model: " 2007 "prologue peel iters set to vf/2."); 2008 2009 /* If peeling for alignment is unknown, loop bound of main loop becomes 2010 unknown. */ 2011 peel_iters_epilogue = vf/2; 2012 if (vect_print_dump_info (REPORT_COST)) 2013 fprintf (vect_dump, "cost model: " 2014 "epilogue peel iters set to vf/2 because " 2015 "peeling for alignment is unknown ."); 2016 2017 /* If peeled iterations are unknown, count a taken branch and a not taken 2018 branch per peeled loop. Even if scalar loop iterations are known, 2019 vector iterations are not known since peeled prologue iterations are 2020 not known. Hence guards remain the same. */ 2021 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST 2022 + TARG_COND_NOT_TAKEN_BRANCH_COST); 2023 } 2024 else 2025 { 2026 if (byte_misalign) 2027 { 2028 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2029 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); 2030 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); 2031 int nelements = TYPE_VECTOR_SUBPARTS (vectype); 2032 2033 peel_iters_prologue = nelements - (byte_misalign / element_size); 2034 } 2035 else 2036 peel_iters_prologue = 0; 2037 2038 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 2039 { 2040 peel_iters_epilogue = vf/2; 2041 if (vect_print_dump_info (REPORT_COST)) 2042 fprintf (vect_dump, "cost model: " 2043 "epilogue peel iters set to vf/2 because " 2044 "loop iterations are unknown ."); 2045 2046 /* If peeled iterations are known but number of scalar loop 2047 iterations are unknown, count a taken branch per peeled loop. */ 2048 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST; 2049 2050 } 2051 else 2052 { 2053 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); 2054 peel_iters_prologue = niters < peel_iters_prologue ? 
2055              niters : peel_iters_prologue;
2056 	  peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2057 	  /* If we need to peel for gaps, but no peeling is required, we have
2058 	     to peel VF iterations.  */
2059 	  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
2060 	    peel_iters_epilogue = vf;
2061 	}
2062     }
2063
2064   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2065                       + (peel_iters_epilogue * scalar_single_iter_cost)
2066                       + peel_guard_costs;
2067
2068   /* FORNOW: The scalar outside cost is incremented in one of the
2069      following ways:
2070
2071      1. The vectorizer checks for alignment and aliasing and generates
2072      a condition that allows dynamic vectorization.  A cost model
2073      check is ANDed with the versioning condition.  Hence the scalar code
2074      path now has the added cost of the versioning check.
2075
2076        if (cost > th & versioning_check)
2077          jmp to vector code
2078
2079      Hence the run-time scalar cost is incremented by a not-taken branch cost.
2080
2081      2. The vectorizer then checks if a prologue is required.  If the
2082      cost model check was not done before during versioning, it has to
2083      be done before the prologue check.
2084
2085        if (cost <= th)
2086          prologue = scalar_iters
2087        if (prologue == 0)
2088          jmp to vector code
2089        else
2090          execute prologue
2091        if (prologue == num_iters)
2092          go to exit
2093
2094      Hence the run-time scalar cost is incremented by a taken branch,
2095      plus a not-taken branch, plus a taken branch cost.
2096
2097      3. The vectorizer then checks if an epilogue is required.  If the
2098      cost model check was not done before during the prologue check, it
2099      has to be done with the epilogue check.
2100
2101        if (prologue == 0)
2102          jmp to vector code
2103        else
2104          execute prologue
2105        if (prologue == num_iters)
2106          go to exit
2107        vector code:
2108          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2109            jmp to epilogue
2110
2111      Hence the run-time scalar cost should be incremented by 2 taken
2112      branches.
2113
2114      TODO: The back end may reorder the BBs differently and reverse
2115      conditions/branch directions.  Change the estimates below to
2116      something more reasonable.  */
2117
2118   /* If the number of iterations is known and we do not do versioning, we can
2119      decide whether to vectorize at compile time.  Hence the scalar version
2120      does not carry cost model guard costs.  */
2121   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2122       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2123       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2124     {
2125       /* Cost model check occurs at versioning.  */
2126       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2127           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2128 	scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2129       else
2130 	{
2131 	  /* Cost model check occurs at prologue generation.  */
2132 	  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2133 	    scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2134 	      + TARG_COND_NOT_TAKEN_BRANCH_COST;
2135 	  /* Cost model check occurs at epilogue generation.  */
2136 	  else
2137 	    scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2138 	}
2139     }
2140
2141   /* Add SLP costs.
*/ 2142 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2143 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) 2144 { 2145 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); 2146 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); 2147 } 2148 2149 /* Calculate number of iterations required to make the vector version 2150 profitable, relative to the loop bodies only. The following condition 2151 must hold true: 2152 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC 2153 where 2154 SIC = scalar iteration cost, VIC = vector iteration cost, 2155 VOC = vector outside cost, VF = vectorization factor, 2156 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations 2157 SOC = scalar outside cost for run time cost model check. */ 2158 2159 if ((scalar_single_iter_cost * vf) > vec_inside_cost) 2160 { 2161 if (vec_outside_cost <= 0) 2162 min_profitable_iters = 1; 2163 else 2164 { 2165 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf 2166 - vec_inside_cost * peel_iters_prologue 2167 - vec_inside_cost * peel_iters_epilogue) 2168 / ((scalar_single_iter_cost * vf) 2169 - vec_inside_cost); 2170 2171 if ((scalar_single_iter_cost * vf * min_profitable_iters) 2172 <= ((vec_inside_cost * min_profitable_iters) 2173 + ((vec_outside_cost - scalar_outside_cost) * vf))) 2174 min_profitable_iters++; 2175 } 2176 } 2177 /* vector version will never be profitable. */ 2178 else 2179 { 2180 if (vect_print_dump_info (REPORT_COST)) 2181 fprintf (vect_dump, "cost model: the vector iteration cost = %d " 2182 "divided by the scalar iteration cost = %d " 2183 "is greater or equal to the vectorization factor = %d.", 2184 vec_inside_cost, scalar_single_iter_cost, vf); 2185 return -1; 2186 } 2187 2188 if (vect_print_dump_info (REPORT_COST)) 2189 { 2190 fprintf (vect_dump, "Cost model analysis: \n"); 2191 fprintf (vect_dump, " Vector inside of loop cost: %d\n", 2192 vec_inside_cost); 2193 fprintf (vect_dump, " Vector outside of loop cost: %d\n", 2194 vec_outside_cost); 2195 fprintf (vect_dump, " Scalar iteration cost: %d\n", 2196 scalar_single_iter_cost); 2197 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost); 2198 fprintf (vect_dump, " prologue iterations: %d\n", 2199 peel_iters_prologue); 2200 fprintf (vect_dump, " epilogue iterations: %d\n", 2201 peel_iters_epilogue); 2202 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n", 2203 min_profitable_iters); 2204 } 2205 2206 min_profitable_iters = 2207 min_profitable_iters < vf ? vf : min_profitable_iters; 2208 2209 /* Because the condition we create is: 2210 if (niters <= min_profitable_iters) 2211 then skip the vectorized loop. */ 2212 min_profitable_iters--; 2213 2214 if (vect_print_dump_info (REPORT_COST)) 2215 fprintf (vect_dump, " Profitability threshold = %d\n", 2216 min_profitable_iters); 2217 2218 return min_profitable_iters; 2219 } 2220 2221 2222 /* TODO: Close dependency between vect_model_*_cost and vectorizable_* 2223 functions. Design better to avoid maintenance issues. */ 2224 2225 /* Function vect_model_reduction_cost. 2226 2227 Models cost for a reduction operation, including the vector ops 2228 generated within the strip-mine loop, the initial definition before 2229 the loop, and the epilogue code that must be generated. 
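   For instance (illustrative costs only), a plain sum reduction with
   NCOPIES == 1 accounts for one vector statement per iteration inside
   the loop, one scalar-to-vector cost for the initial definition, and
   an epilogue that is either a single reduction op plus a scalar
   extract (when REDUC_CODE is supported), log2(N) shift/op pairs plus
   an extract, or N extracts and N-1 scalar ops, for a vector of N
   elements.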
*/ 2230 2231 static bool 2232 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, 2233 int ncopies) 2234 { 2235 int outer_cost = 0; 2236 enum tree_code code; 2237 optab optab; 2238 tree vectype; 2239 gimple stmt, orig_stmt; 2240 tree reduction_op; 2241 enum machine_mode mode; 2242 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2243 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2244 2245 2246 /* Cost of reduction op inside loop. */ 2247 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST; 2248 2249 stmt = STMT_VINFO_STMT (stmt_info); 2250 2251 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 2252 { 2253 case GIMPLE_SINGLE_RHS: 2254 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op); 2255 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2); 2256 break; 2257 case GIMPLE_UNARY_RHS: 2258 reduction_op = gimple_assign_rhs1 (stmt); 2259 break; 2260 case GIMPLE_BINARY_RHS: 2261 reduction_op = gimple_assign_rhs2 (stmt); 2262 break; 2263 default: 2264 gcc_unreachable (); 2265 } 2266 2267 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); 2268 if (!vectype) 2269 { 2270 if (vect_print_dump_info (REPORT_COST)) 2271 { 2272 fprintf (vect_dump, "unsupported data-type "); 2273 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM); 2274 } 2275 return false; 2276 } 2277 2278 mode = TYPE_MODE (vectype); 2279 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 2280 2281 if (!orig_stmt) 2282 orig_stmt = STMT_VINFO_STMT (stmt_info); 2283 2284 code = gimple_assign_rhs_code (orig_stmt); 2285 2286 /* Add in cost for initial definition. */ 2287 outer_cost += TARG_SCALAR_TO_VEC_COST; 2288 2289 /* Determine cost of epilogue code. 2290 2291 We have a reduction operator that will reduce the vector in one statement. 2292 Also requires scalar extract. */ 2293 2294 if (!nested_in_vect_loop_p (loop, orig_stmt)) 2295 { 2296 if (reduc_code != ERROR_MARK) 2297 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; 2298 else 2299 { 2300 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 2301 tree bitsize = 2302 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); 2303 int element_bitsize = tree_low_cst (bitsize, 1); 2304 int nelements = vec_size_in_bits / element_bitsize; 2305 2306 optab = optab_for_tree_code (code, vectype, optab_default); 2307 2308 /* We have a whole vector shift available. */ 2309 if (VECTOR_MODE_P (mode) 2310 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing 2311 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) 2312 /* Final reduction via vector shifts and the reduction operator. Also 2313 requires scalar extract. */ 2314 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST 2315 + TARG_VEC_TO_SCALAR_COST); 2316 else 2317 /* Use extracts and reduction op for final reduction. For N elements, 2318 we have N extracts and N-1 reduction ops. */ 2319 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); 2320 } 2321 } 2322 2323 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; 2324 2325 if (vect_print_dump_info (REPORT_COST)) 2326 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, " 2327 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), 2328 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); 2329 2330 return true; 2331 } 2332 2333 2334 /* Function vect_model_induction_cost. 2335 2336 Models cost for induction operations. 
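   For example (a sketch, for VF == 4, initial value X, step S and
   NCOPIES == 1), the vectorized induction consists of:

     loop prolog:
       vec_init = [X, X+S, X+2*S, X+3*S]   # scalar-to-vec cost
       vec_step = [4*S, 4*S, 4*S, 4*S]     # scalar-to-vec cost
     loop:
       vec_iv = PHI <vec_init, vec_next>
       vec_next = vec_iv + vec_step;       # one vector stmt per copy

   hence NCOPIES vector statements inside the loop, and two
   initializations in the prologue.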
*/ 2337 2338 static void 2339 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) 2340 { 2341 /* loop cost for vec_loop. */ 2342 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST; 2343 /* prologue cost for vec_init and vec_step. */ 2344 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST; 2345 2346 if (vect_print_dump_info (REPORT_COST)) 2347 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, " 2348 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), 2349 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); 2350 } 2351 2352 2353 /* Function get_initial_def_for_induction 2354 2355 Input: 2356 STMT - a stmt that performs an induction operation in the loop. 2357 IV_PHI - the initial value of the induction variable 2358 2359 Output: 2360 Return a vector variable, initialized with the first VF values of 2361 the induction variable. E.g., for an iv with IV_PHI='X' and 2362 evolution S, for a vector of 4 units, we want to return: 2363 [X, X + S, X + 2*S, X + 3*S]. */ 2364 2365 static tree 2366 get_initial_def_for_induction (gimple iv_phi) 2367 { 2368 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); 2369 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 2370 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2371 tree scalar_type; 2372 tree vectype; 2373 int nunits; 2374 edge pe = loop_preheader_edge (loop); 2375 struct loop *iv_loop; 2376 basic_block new_bb; 2377 tree vec, vec_init, vec_step, t; 2378 tree access_fn; 2379 tree new_var; 2380 tree new_name; 2381 gimple init_stmt, induction_phi, new_stmt; 2382 tree induc_def, vec_def, vec_dest; 2383 tree init_expr, step_expr; 2384 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2385 int i; 2386 bool ok; 2387 int ncopies; 2388 tree expr; 2389 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); 2390 bool nested_in_vect_loop = false; 2391 gimple_seq stmts = NULL; 2392 imm_use_iterator imm_iter; 2393 use_operand_p use_p; 2394 gimple exit_phi; 2395 edge latch_e; 2396 tree loop_arg; 2397 gimple_stmt_iterator si; 2398 basic_block bb = gimple_bb (iv_phi); 2399 tree stepvectype; 2400 tree resvectype; 2401 2402 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ 2403 if (nested_in_vect_loop_p (loop, iv_phi)) 2404 { 2405 nested_in_vect_loop = true; 2406 iv_loop = loop->inner; 2407 } 2408 else 2409 iv_loop = loop; 2410 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father); 2411 2412 latch_e = loop_latch_edge (iv_loop); 2413 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); 2414 2415 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); 2416 gcc_assert (access_fn); 2417 STRIP_NOPS (access_fn); 2418 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, 2419 &init_expr, &step_expr); 2420 gcc_assert (ok); 2421 pe = loop_preheader_edge (iv_loop); 2422 2423 scalar_type = TREE_TYPE (init_expr); 2424 vectype = get_vectype_for_scalar_type (scalar_type); 2425 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi))); 2426 gcc_assert (vectype); 2427 nunits = TYPE_VECTOR_SUBPARTS (vectype); 2428 ncopies = vf / nunits; 2429 2430 gcc_assert (phi_info); 2431 gcc_assert (ncopies >= 1); 2432 2433 /* Find the first insertion point in the BB. */ 2434 si = gsi_after_labels (bb); 2435 2436 /* Create the vector that holds the initial_value of the induction. */ 2437 if (nested_in_vect_loop) 2438 { 2439 /* iv_loop is nested in the loop to be vectorized. 
init_expr had already 2440 been created during vectorization of previous stmts; We obtain it from 2441 the STMT_VINFO_VEC_STMT of the defining stmt. */ 2442 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, 2443 loop_preheader_edge (iv_loop)); 2444 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); 2445 } 2446 else 2447 { 2448 /* iv_loop is the loop to be vectorized. Create: 2449 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 2450 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); 2451 add_referenced_var (new_var); 2452 2453 new_name = force_gimple_operand (init_expr, &stmts, false, new_var); 2454 if (stmts) 2455 { 2456 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 2457 gcc_assert (!new_bb); 2458 } 2459 2460 t = NULL_TREE; 2461 t = tree_cons (NULL_TREE, new_name, t); 2462 for (i = 1; i < nunits; i++) 2463 { 2464 /* Create: new_name_i = new_name + step_expr */ 2465 enum tree_code code = POINTER_TYPE_P (scalar_type) 2466 ? POINTER_PLUS_EXPR : PLUS_EXPR; 2467 init_stmt = gimple_build_assign_with_ops (code, new_var, 2468 new_name, step_expr); 2469 new_name = make_ssa_name (new_var, init_stmt); 2470 gimple_assign_set_lhs (init_stmt, new_name); 2471 2472 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt); 2473 gcc_assert (!new_bb); 2474 2475 if (vect_print_dump_info (REPORT_DETAILS)) 2476 { 2477 fprintf (vect_dump, "created new init_stmt: "); 2478 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM); 2479 } 2480 t = tree_cons (NULL_TREE, new_name, t); 2481 } 2482 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ 2483 vec = build_constructor_from_list (vectype, nreverse (t)); 2484 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL); 2485 } 2486 2487 2488 /* Create the vector that holds the step of the induction. */ 2489 if (nested_in_vect_loop) 2490 /* iv_loop is nested in the loop to be vectorized. Generate: 2491 vec_step = [S, S, S, S] */ 2492 new_name = step_expr; 2493 else 2494 { 2495 /* iv_loop is the loop to be vectorized. Generate: 2496 vec_step = [VF*S, VF*S, VF*S, VF*S] */ 2497 expr = build_int_cst (TREE_TYPE (step_expr), vf); 2498 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 2499 expr, step_expr); 2500 } 2501 2502 t = NULL_TREE; 2503 for (i = 0; i < nunits; i++) 2504 t = tree_cons (NULL_TREE, unshare_expr (new_name), t); 2505 gcc_assert (CONSTANT_CLASS_P (new_name)); 2506 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name)); 2507 gcc_assert (stepvectype); 2508 vec = build_vector (stepvectype, t); 2509 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); 2510 2511 2512 /* Create the following def-use cycle: 2513 loop prolog: 2514 vec_init = ... 2515 vec_step = ... 2516 loop: 2517 vec_iv = PHI <vec_init, vec_loop> 2518 ... 2519 STMT 2520 ... 2521 vec_loop = vec_iv + vec_step; */ 2522 2523 /* Create the induction-phi that defines the induction-operand. 
*/ 2524 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 2525 add_referenced_var (vec_dest); 2526 induction_phi = create_phi_node (vec_dest, iv_loop->header); 2527 set_vinfo_for_stmt (induction_phi, 2528 new_stmt_vec_info (induction_phi, loop_vinfo, NULL)); 2529 induc_def = PHI_RESULT (induction_phi); 2530 2531 /* Create the iv update inside the loop */ 2532 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest, 2533 induc_def, vec_step); 2534 vec_def = make_ssa_name (vec_dest, new_stmt); 2535 gimple_assign_set_lhs (new_stmt, vec_def); 2536 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 2537 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo, 2538 NULL)); 2539 2540 /* Set the arguments of the phi node: */ 2541 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 2542 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 2543 UNKNOWN_LOCATION); 2544 2545 2546 /* In case that vectorization factor (VF) is bigger than the number 2547 of elements that we can fit in a vectype (nunits), we have to generate 2548 more than one vector stmt - i.e - we need to "unroll" the 2549 vector stmt by a factor VF/nunits. For more details see documentation 2550 in vectorizable_operation. */ 2551 2552 if (ncopies > 1) 2553 { 2554 stmt_vec_info prev_stmt_vinfo; 2555 /* FORNOW. This restriction should be relaxed. */ 2556 gcc_assert (!nested_in_vect_loop); 2557 2558 /* Create the vector that holds the step of the induction. */ 2559 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 2560 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 2561 expr, step_expr); 2562 t = NULL_TREE; 2563 for (i = 0; i < nunits; i++) 2564 t = tree_cons (NULL_TREE, unshare_expr (new_name), t); 2565 gcc_assert (CONSTANT_CLASS_P (new_name)); 2566 vec = build_vector (stepvectype, t); 2567 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); 2568 2569 vec_def = induc_def; 2570 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); 2571 for (i = 1; i < ncopies; i++) 2572 { 2573 /* vec_i = vec_prev + vec_step */ 2574 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest, 2575 vec_def, vec_step); 2576 vec_def = make_ssa_name (vec_dest, new_stmt); 2577 gimple_assign_set_lhs (new_stmt, vec_def); 2578 2579 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 2580 if (!useless_type_conversion_p (resvectype, vectype)) 2581 { 2582 new_stmt = gimple_build_assign_with_ops 2583 (VIEW_CONVERT_EXPR, 2584 vect_get_new_vect_var (resvectype, vect_simple_var, 2585 "vec_iv_"), 2586 build1 (VIEW_CONVERT_EXPR, resvectype, 2587 gimple_assign_lhs (new_stmt)), NULL_TREE); 2588 gimple_assign_set_lhs (new_stmt, 2589 make_ssa_name 2590 (gimple_assign_lhs (new_stmt), new_stmt)); 2591 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 2592 } 2593 set_vinfo_for_stmt (new_stmt, 2594 new_stmt_vec_info (new_stmt, loop_vinfo, NULL)); 2595 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; 2596 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); 2597 } 2598 } 2599 2600 if (nested_in_vect_loop) 2601 { 2602 /* Find the loop-closed exit-phi of the induction, and record 2603 the final vector of induction results: */ 2604 exit_phi = NULL; 2605 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 2606 { 2607 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p)))) 2608 { 2609 exit_phi = USE_STMT (use_p); 2610 break; 2611 } 2612 } 2613 if (exit_phi) 2614 { 2615 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); 2616 /* FORNOW. 
Currently not supporting the case that an inner-loop induction
2617 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
2618 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2619                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2620
2621 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2622 	  if (vect_print_dump_info (REPORT_DETAILS))
2623 	    {
2624 	      fprintf (vect_dump, "vector of inductions after inner-loop:");
2625 	      print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2626 	    }
2627 	}
2628     }
2629
2630
2631   if (vect_print_dump_info (REPORT_DETAILS))
2632     {
2633       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2634       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2635       fprintf (vect_dump, "\n");
2636       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2637     }
2638
2639   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2640   if (!useless_type_conversion_p (resvectype, vectype))
2641     {
2642       new_stmt = gimple_build_assign_with_ops
2643 	  (VIEW_CONVERT_EXPR,
2644 	   vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2645 	   build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2646       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2647       gimple_assign_set_lhs (new_stmt, induc_def);
2648       si = gsi_start_bb (bb);
2649       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2650       set_vinfo_for_stmt (new_stmt,
2651 			  new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2652       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2653 	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2654     }
2655
2656   return induc_def;
2657 }
2658
2659
2660 /* Function get_initial_def_for_reduction
2661
2662    Input:
2663    STMT - a stmt that performs a reduction operation in the loop.
2664    INIT_VAL - the initial value of the reduction variable
2665
2666    Output:
2667    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2668    of the reduction (used for adjusting the epilog - see below).
2669    Return a vector variable, initialized according to the operation that STMT
2670    performs.  This vector will be used as the initial value of the
2671    vector of partial results.
2672
2673    Option1 (adjust in epilog): Initialize the vector as follows:
2674      add/bit or/xor:    [0,0,...,0,0]
2675      mult/bit and:      [1,1,...,1,1]
2676      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2677    and when necessary (e.g. add/mult case) let the caller know
2678    that it needs to adjust the result by init_val.
2679
2680    Option2: Initialize the vector as follows:
2681      add/bit or/xor:    [init_val,0,0,...,0]
2682      mult/bit and:      [init_val,1,1,...,1]
2683      min/max/cond_expr: [init_val,init_val,...,init_val]
2684    and no adjustments are needed.
2685
2686    For example, for the following code:
2687
2688    s = init_val;
2689    for (i=0;i<n;i++)
2690      s = s + a[i];
2691
2692    STMT is 's = s + a[i]', and the reduction variable is 's'.
2693    For a vector of 4 units, we want to return either [0,0,0,init_val],
2694    or [0,0,0,0] and let the caller know that it needs to adjust
2695    the result at the end by 'init_val'.
2696
2697    FORNOW, Option1 ('adjust in epilog') is used whenever ADJUSTMENT_DEF is
2698    not NULL, because this way the initialization vector is simpler (the
2699    same element in all entries); Option2 is used otherwise.
2700
2701    A cost model should help decide between these two schemes.
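   Similarly, for a product:

   s = init_val;
   for (i=0;i<n;i++)
     s = s * a[i];

   Option1 returns [1,1,1,1] and lets the epilog multiply the final
   result by 'init_val', whereas Option2 returns [init_val,1,1,1].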
*/
2703 tree
2704 get_initial_def_for_reduction (gimple stmt, tree init_val,
2705                                tree *adjustment_def)
2706 {
2707   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2708   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2709   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2710   tree scalar_type = TREE_TYPE (init_val);
2711   tree vectype = get_vectype_for_scalar_type (scalar_type);
2712   int nunits;
2713   enum tree_code code = gimple_assign_rhs_code (stmt);
2714   tree def_for_init;
2715   tree init_def;
2716   tree t = NULL_TREE;
2717   int i;
2718   bool nested_in_vect_loop = false;
2719   tree init_value;
2720   REAL_VALUE_TYPE real_init_val = dconst0;
2721   int int_init_val = 0;
2722   gimple def_stmt = NULL;
2723
2724   gcc_assert (vectype);
2725   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2726
2727   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2728 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
2729
2730   if (nested_in_vect_loop_p (loop, stmt))
2731     nested_in_vect_loop = true;
2732   else
2733     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2734
2735   /* In case of double reduction we only create a vector variable to be put
2736      in the reduction phi node.  The actual statement creation is done in
2737      vect_create_epilog_for_reduction.  */
2738   if (adjustment_def && nested_in_vect_loop
2739       && TREE_CODE (init_val) == SSA_NAME
2740       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2741       && gimple_code (def_stmt) == GIMPLE_PHI
2742       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2743       && vinfo_for_stmt (def_stmt)
2744       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2745          == vect_double_reduction_def)
2746     {
2747       *adjustment_def = NULL;
2748       return vect_create_destination_var (init_val, vectype);
2749     }
2750
2751   if (TREE_CONSTANT (init_val))
2752     {
2753       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2754 	init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2755       else
2756 	init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2757     }
2758   else
2759     init_value = init_val;
2760
2761   switch (code)
2762     {
2763       case WIDEN_SUM_EXPR:
2764       case DOT_PROD_EXPR:
2765       case PLUS_EXPR:
2766       case MINUS_EXPR:
2767       case BIT_IOR_EXPR:
2768       case BIT_XOR_EXPR:
2769       case MULT_EXPR:
2770       case BIT_AND_EXPR:
2771         /* ADJUSTMENT_DEF is NULL when called from
2772            vect_create_epilog_for_reduction to vectorize double reduction.  */
2773         if (adjustment_def)
2774           {
2775             if (nested_in_vect_loop)
2776               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2777                                                               NULL);
2778             else
2779               *adjustment_def = init_val;
2780           }
2781
2782         if (code == MULT_EXPR)
2783           {
2784             real_init_val = dconst1;
2785             int_init_val = 1;
2786           }
2787
2788         if (code == BIT_AND_EXPR)
2789           int_init_val = -1;
2790
2791         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2792           def_for_init = build_real (scalar_type, real_init_val);
2793         else
2794           def_for_init = build_int_cst (scalar_type, int_init_val);
2795
2796         /* Create a vector of '0' or '1' for all but the first element.  */
2797         for (i = nunits - 2; i >= 0; --i)
2798           t = tree_cons (NULL_TREE, def_for_init, t);
2799
2800         /* Option1: the first element is '0' or '1' as well.  */
2801         if (adjustment_def)
2802           {
2803             t = tree_cons (NULL_TREE, def_for_init, t);
2804             init_def = build_vector (vectype, t);
2805             break;
2806           }
2807
2808         /* Option2: the first element is INIT_VAL.
*/ 2809 t = tree_cons (NULL_TREE, init_value, t); 2810 if (TREE_CONSTANT (init_val)) 2811 init_def = build_vector (vectype, t); 2812 else 2813 init_def = build_constructor_from_list (vectype, t); 2814 2815 break; 2816 2817 case MIN_EXPR: 2818 case MAX_EXPR: 2819 case COND_EXPR: 2820 if (adjustment_def) 2821 { 2822 *adjustment_def = NULL_TREE; 2823 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL); 2824 break; 2825 } 2826 2827 for (i = nunits - 1; i >= 0; --i) 2828 t = tree_cons (NULL_TREE, init_value, t); 2829 2830 if (TREE_CONSTANT (init_val)) 2831 init_def = build_vector (vectype, t); 2832 else 2833 init_def = build_constructor_from_list (vectype, t); 2834 2835 break; 2836 2837 default: 2838 gcc_unreachable (); 2839 } 2840 2841 return init_def; 2842 } 2843 2844 2845 /* Function vect_create_epilog_for_reduction 2846 2847 Create code at the loop-epilog to finalize the result of a reduction 2848 computation. 2849 2850 VECT_DEF is a vector of partial results. 2851 REDUC_CODE is the tree-code for the epilog reduction. 2852 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the 2853 number of elements that we can fit in a vectype (nunits). In this case 2854 we have to generate more than one vector stmt - i.e - we need to "unroll" 2855 the vector stmt by a factor VF/nunits. For more details see documentation 2856 in vectorizable_operation. 2857 STMT is the scalar reduction stmt that is being vectorized. 2858 REDUCTION_PHI is the phi-node that carries the reduction computation. 2859 REDUC_INDEX is the index of the operand in the right hand side of the 2860 statement that is defined by REDUCTION_PHI. 2861 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. 2862 2863 This function: 2864 1. Creates the reduction def-use cycle: sets the arguments for 2865 REDUCTION_PHI: 2866 The loop-entry argument is the vectorized initial-value of the reduction. 2867 The loop-latch argument is VECT_DEF - the vector of partial sums. 2868 2. "Reduces" the vector of partial results VECT_DEF into a single result, 2869 by applying the operation specified by REDUC_CODE if available, or by 2870 other means (whole-vector shifts or a scalar loop). 2871 The function also creates a new phi node at the loop exit to preserve 2872 loop-closed form, as illustrated below. 
2873 2874 The flow at the entry to this function: 2875 2876 loop: 2877 vec_def = phi <null, null> # REDUCTION_PHI 2878 VECT_DEF = vector_stmt # vectorized form of STMT 2879 s_loop = scalar_stmt # (scalar) STMT 2880 loop_exit: 2881 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 2882 use <s_out0> 2883 use <s_out0> 2884 2885 The above is transformed by this function into: 2886 2887 loop: 2888 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 2889 VECT_DEF = vector_stmt # vectorized form of STMT 2890 s_loop = scalar_stmt # (scalar) STMT 2891 loop_exit: 2892 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 2893 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 2894 v_out2 = reduce <v_out1> 2895 s_out3 = extract_field <v_out2, 0> 2896 s_out4 = adjust_result <s_out3> 2897 use <s_out4> 2898 use <s_out4> 2899 */ 2900 2901 static void 2902 vect_create_epilog_for_reduction (tree vect_def, gimple stmt, 2903 int ncopies, 2904 enum tree_code reduc_code, 2905 gimple reduction_phi, 2906 int reduc_index, 2907 bool double_reduc) 2908 { 2909 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2910 stmt_vec_info prev_phi_info; 2911 tree vectype; 2912 enum machine_mode mode; 2913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2914 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; 2915 basic_block exit_bb; 2916 tree scalar_dest; 2917 tree scalar_type; 2918 gimple new_phi = NULL, phi; 2919 gimple_stmt_iterator exit_gsi; 2920 tree vec_dest; 2921 tree new_temp = NULL_TREE; 2922 tree new_name; 2923 gimple epilog_stmt = NULL; 2924 tree new_scalar_dest, new_dest; 2925 gimple exit_phi; 2926 tree bitsize, bitpos; 2927 enum tree_code code = gimple_assign_rhs_code (stmt); 2928 tree adjustment_def; 2929 tree vec_initial_def, def; 2930 tree orig_name; 2931 imm_use_iterator imm_iter; 2932 use_operand_p use_p; 2933 bool extract_scalar_result = false; 2934 tree reduction_op, expr; 2935 gimple orig_stmt; 2936 gimple use_stmt; 2937 bool nested_in_vect_loop = false; 2938 VEC(gimple,heap) *phis = NULL; 2939 enum vect_def_type dt = vect_unknown_def_type; 2940 int j, i; 2941 2942 if (nested_in_vect_loop_p (loop, stmt)) 2943 { 2944 outer_loop = loop; 2945 loop = loop->inner; 2946 nested_in_vect_loop = true; 2947 } 2948 2949 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 2950 { 2951 case GIMPLE_SINGLE_RHS: 2952 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) 2953 == ternary_op); 2954 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index); 2955 break; 2956 case GIMPLE_UNARY_RHS: 2957 reduction_op = gimple_assign_rhs1 (stmt); 2958 break; 2959 case GIMPLE_BINARY_RHS: 2960 reduction_op = reduc_index ? 2961 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt); 2962 break; 2963 default: 2964 gcc_unreachable (); 2965 } 2966 2967 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); 2968 gcc_assert (vectype); 2969 mode = TYPE_MODE (vectype); 2970 2971 /*** 1. Create the reduction def-use cycle ***/ 2972 2973 /* For the case of reduction, vect_get_vec_def_for_operand returns 2974 the scalar def before the loop, that defines the initial value 2975 of the reduction variable. 
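      (For example, for a sum reduction with initial value 'init', it
      returns a vector such as [0,0,0,0] and sets ADJUSTMENT_DEF to
      'init' - see get_initial_def_for_reduction above.)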
*/
2976   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2977                                                   &adjustment_def);
2978
2979   phi = reduction_phi;
2980   def = vect_def;
2981   for (j = 0; j < ncopies; j++)
2982     {
2983       /* 1.1 Set the loop-entry arg of the reduction-phi:  */
2984       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2985 		   UNKNOWN_LOCATION);
2986
2987       /* 1.2 Set the loop-latch arg for the reduction-phi:  */
2988       if (j > 0)
2989         def = vect_get_vec_def_for_stmt_copy (dt, def);
2990       add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2991
2992       if (vect_print_dump_info (REPORT_DETAILS))
2993 	{
2994 	  fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2995 	  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2996 	  fprintf (vect_dump, "\n");
2997 	  print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2998 	}
2999
3000       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3001     }
3002
3003   /*** 2. Create epilog code
3004 	  The reduction epilog code operates across the elements of the vector
3005           of partial results computed by the vectorized loop.
3006           The reduction epilog code consists of:
3007           step 1: compute the scalar result in a vector (v_out2)
3008           step 2: extract the scalar result (s_out3) from the vector (v_out2)
3009           step 3: adjust the scalar result (s_out3) if needed.
3010
3011           Step 1 can be accomplished using one of the following three schemes:
3012           (scheme 1) using reduc_code, if available.
3013           (scheme 2) using whole-vector shifts, if available.
3014           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3015                      combined.
3016
3017           The overall epilog code looks like this:
3018
3019           s_out0 = phi <s_loop>                # original EXIT_PHI
3020           v_out1 = phi <VECT_DEF>              # NEW_EXIT_PHI
3021           v_out2 = reduce <v_out1>             # step 1
3022           s_out3 = extract_field <v_out2, 0>   # step 2
3023           s_out4 = adjust_result <s_out3>      # step 3
3024
3025           (step 3 is optional, and steps 1 and 2 may be combined).
3026           Lastly, the uses of s_out0 are replaced by s_out4.
3027
3028    ***/
3029
3030   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
3031          v_out1 = phi <v_loop>  */
3032
3033   exit_bb = single_exit (loop)->dest;
3034   def = vect_def;
3035   prev_phi_info = NULL;
3036   for (j = 0; j < ncopies; j++)
3037     {
3038       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
3039       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3040       if (j == 0)
3041 	new_phi = phi;
3042       else
3043 	{
3044 	  def = vect_get_vec_def_for_stmt_copy (dt, def);
3045 	  STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3046 	}
3047       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3048       prev_phi_info = vinfo_for_stmt (phi);
3049     }
3050
3051   exit_gsi = gsi_after_labels (exit_bb);
3052
3053   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3054          (i.e. when reduc_code is not available) and in the final adjustment
3055 	 code (if needed).  Also get the original scalar reduction variable as
3056          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3057          represents a reduction pattern), the tree-code and scalar-def are
3058          taken from the original stmt that the pattern-stmt (STMT) replaces.
3059          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3060          are taken from STMT.
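         E.g. (cf. the widening-summation pattern recognized during
         vect_pattern_recog), for an original sequence

           S1: x_t = (TYPE) short_x[i];
           S2: sum_1 = x_t + sum_0;

         that was replaced by the pattern-stmt

           STMT: sum_1 = WIDEN_SUM_EXPR <short_x[i], sum_0>;

         the epilog uses the PLUS_EXPR and the scalar-def of S2.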
*/ 3061 3062 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 3063 if (!orig_stmt) 3064 { 3065 /* Regular reduction */ 3066 orig_stmt = stmt; 3067 } 3068 else 3069 { 3070 /* Reduction pattern */ 3071 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); 3072 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); 3073 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); 3074 } 3075 3076 code = gimple_assign_rhs_code (orig_stmt); 3077 scalar_dest = gimple_assign_lhs (orig_stmt); 3078 scalar_type = TREE_TYPE (scalar_dest); 3079 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 3080 bitsize = TYPE_SIZE (scalar_type); 3081 3082 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, 3083 partial results are added and not subtracted. */ 3084 if (code == MINUS_EXPR) 3085 code = PLUS_EXPR; 3086 3087 /* In case this is a reduction in an inner-loop while vectorizing an outer 3088 loop - we don't need to extract a single scalar result at the end of the 3089 inner-loop (unless it is double reduction, i.e., the use of reduction is 3090 outside the outer-loop). The final vector of partial results will be used 3091 in the vectorized outer-loop, or reduced to a scalar result at the end of 3092 the outer-loop. */ 3093 if (nested_in_vect_loop && !double_reduc) 3094 goto vect_finalize_reduction; 3095 3096 /* The epilogue is created for the outer-loop, i.e., for the loop being 3097 vectorized. */ 3098 if (double_reduc) 3099 loop = outer_loop; 3100 3101 /* FORNOW */ 3102 gcc_assert (ncopies == 1); 3103 3104 /* 2.3 Create the reduction code, using one of the three schemes described 3105 above. */ 3106 3107 if (reduc_code != ERROR_MARK) 3108 { 3109 tree tmp; 3110 3111 /*** Case 1: Create: 3112 v_out2 = reduc_expr <v_out1> */ 3113 3114 if (vect_print_dump_info (REPORT_DETAILS)) 3115 fprintf (vect_dump, "Reduce using direct vector reduction."); 3116 3117 vec_dest = vect_create_destination_var (scalar_dest, vectype); 3118 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi)); 3119 epilog_stmt = gimple_build_assign (vec_dest, tmp); 3120 new_temp = make_ssa_name (vec_dest, epilog_stmt); 3121 gimple_assign_set_lhs (epilog_stmt, new_temp); 3122 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3123 3124 extract_scalar_result = true; 3125 } 3126 else 3127 { 3128 enum tree_code shift_code = ERROR_MARK; 3129 bool have_whole_vector_shift = true; 3130 int bit_offset; 3131 int element_bitsize = tree_low_cst (bitsize, 1); 3132 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 3133 tree vec_temp; 3134 3135 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) 3136 shift_code = VEC_RSHIFT_EXPR; 3137 else 3138 have_whole_vector_shift = false; 3139 3140 /* Regardless of whether we have a whole vector shift, if we're 3141 emulating the operation via tree-vect-generic, we don't want 3142 to use it. Only the first round of the reduction is likely 3143 to still be profitable via emulation. */ 3144 /* ??? It might be better to emit a reduction tree code here, so that 3145 tree-vect-generic can expand the first round via bit tricks. 
*/ 3146 if (!VECTOR_MODE_P (mode)) 3147 have_whole_vector_shift = false; 3148 else 3149 { 3150 optab optab = optab_for_tree_code (code, vectype, optab_default); 3151 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) 3152 have_whole_vector_shift = false; 3153 } 3154 3155 if (have_whole_vector_shift) 3156 { 3157 /*** Case 2: Create: 3158 for (offset = VS/2; offset >= element_size; offset/=2) 3159 { 3160 Create: va' = vec_shift <va, offset> 3161 Create: va = vop <va, va'> 3162 } */ 3163 3164 if (vect_print_dump_info (REPORT_DETAILS)) 3165 fprintf (vect_dump, "Reduce using vector shifts"); 3166 3167 vec_dest = vect_create_destination_var (scalar_dest, vectype); 3168 new_temp = PHI_RESULT (new_phi); 3169 3170 for (bit_offset = vec_size_in_bits/2; 3171 bit_offset >= element_bitsize; 3172 bit_offset /= 2) 3173 { 3174 tree bitpos = size_int (bit_offset); 3175 3176 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest, 3177 new_temp, bitpos); 3178 new_name = make_ssa_name (vec_dest, epilog_stmt); 3179 gimple_assign_set_lhs (epilog_stmt, new_name); 3180 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3181 3182 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest, 3183 new_name, new_temp); 3184 new_temp = make_ssa_name (vec_dest, epilog_stmt); 3185 gimple_assign_set_lhs (epilog_stmt, new_temp); 3186 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3187 } 3188 3189 extract_scalar_result = true; 3190 } 3191 else 3192 { 3193 tree rhs; 3194 3195 /*** Case 3: Create: 3196 s = extract_field <v_out2, 0> 3197 for (offset = element_size; 3198 offset < vector_size; 3199 offset += element_size;) 3200 { 3201 Create: s' = extract_field <v_out2, offset> 3202 Create: s = op <s, s'> 3203 } */ 3204 3205 if (vect_print_dump_info (REPORT_DETAILS)) 3206 fprintf (vect_dump, "Reduce using scalar code. "); 3207 3208 vec_temp = PHI_RESULT (new_phi); 3209 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); 3210 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 3211 bitsize_zero_node); 3212 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 3213 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 3214 gimple_assign_set_lhs (epilog_stmt, new_temp); 3215 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3216 3217 for (bit_offset = element_bitsize; 3218 bit_offset < vec_size_in_bits; 3219 bit_offset += element_bitsize) 3220 { 3221 tree bitpos = bitsize_int (bit_offset); 3222 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 3223 bitpos); 3224 3225 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 3226 new_name = make_ssa_name (new_scalar_dest, epilog_stmt); 3227 gimple_assign_set_lhs (epilog_stmt, new_name); 3228 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3229 3230 epilog_stmt = gimple_build_assign_with_ops (code, 3231 new_scalar_dest, 3232 new_name, new_temp); 3233 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 3234 gimple_assign_set_lhs (epilog_stmt, new_temp); 3235 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3236 } 3237 3238 extract_scalar_result = false; 3239 } 3240 } 3241 3242 /* 2.4 Extract the final scalar result. 
Create: 3243 s_out3 = extract_field <v_out2, bitpos> */ 3244 3245 if (extract_scalar_result) 3246 { 3247 tree rhs; 3248 3249 gcc_assert (!nested_in_vect_loop || double_reduc); 3250 if (vect_print_dump_info (REPORT_DETAILS)) 3251 fprintf (vect_dump, "extract scalar result"); 3252 3253 if (BYTES_BIG_ENDIAN) 3254 bitpos = size_binop (MULT_EXPR, 3255 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), 3256 TYPE_SIZE (scalar_type)); 3257 else 3258 bitpos = bitsize_zero_node; 3259 3260 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos); 3261 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 3262 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 3263 gimple_assign_set_lhs (epilog_stmt, new_temp); 3264 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3265 } 3266 3267 vect_finalize_reduction: 3268 3269 if (double_reduc) 3270 loop = loop->inner; 3271 3272 /* 2.5 Adjust the final result by the initial value of the reduction 3273 variable. (When such adjustment is not needed, then 3274 'adjustment_def' is zero). For example, if code is PLUS we create: 3275 new_temp = loop_exit_def + adjustment_def */ 3276 3277 if (adjustment_def) 3278 { 3279 if (nested_in_vect_loop) 3280 { 3281 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); 3282 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); 3283 new_dest = vect_create_destination_var (scalar_dest, vectype); 3284 } 3285 else 3286 { 3287 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 3288 expr = build2 (code, scalar_type, new_temp, adjustment_def); 3289 new_dest = vect_create_destination_var (scalar_dest, scalar_type); 3290 } 3291 3292 epilog_stmt = gimple_build_assign (new_dest, expr); 3293 new_temp = make_ssa_name (new_dest, epilog_stmt); 3294 gimple_assign_set_lhs (epilog_stmt, new_temp); 3295 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt; 3296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 3297 } 3298 3299 3300 /* 2.6 Handle the loop-exit phi */ 3301 3302 /* Replace uses of s_out0 with uses of s_out3: 3303 Find the loop-closed-use at the loop exit of the original scalar result. 3304 (The reduction result is expected to have two immediate uses - one at the 3305 latch block, and one at the loop exit). */ 3306 phis = VEC_alloc (gimple, heap, 10); 3307 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 3308 { 3309 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 3310 { 3311 exit_phi = USE_STMT (use_p); 3312 VEC_quick_push (gimple, phis, exit_phi); 3313 } 3314 } 3315 3316 /* We expect to have found an exit_phi because of loop-closed-ssa form. */ 3317 gcc_assert (!VEC_empty (gimple, phis)); 3318 3319 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) 3320 { 3321 if (nested_in_vect_loop) 3322 { 3323 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); 3324 gimple vect_phi; 3325 3326 /* FORNOW. Currently not supporting the case that an inner-loop 3327 reduction is not used in the outer-loop (but only outside the 3328 outer-loop), unless it is double reduction. */ 3329 gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo) 3330 && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc); 3331 3332 epilog_stmt = adjustment_def ? 
epilog_stmt : new_phi; 3333 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; 3334 set_vinfo_for_stmt (epilog_stmt, 3335 new_stmt_vec_info (epilog_stmt, loop_vinfo, 3336 NULL)); 3337 if (adjustment_def) 3338 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = 3339 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); 3340 3341 if (!double_reduc 3342 || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def) 3343 continue; 3344 3345 /* Handle double reduction: 3346 3347 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) 3348 stmt2: s3 = phi <s1, s4> - (regular) reduction phi (inner loop) 3349 stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop) 3350 stmt4: s2 = phi <s4> - double reduction stmt (outer loop) 3351 3352 At that point the regular reduction (stmt2 and stmt3) is already 3353 vectorized, as well as the exit phi node, stmt4. 3354 Here we vectorize the phi node of double reduction, stmt1, and 3355 update all relevant statements. */ 3356 3357 /* Go through all the uses of s2 to find double reduction phi node, 3358 i.e., stmt1 above. */ 3359 orig_name = PHI_RESULT (exit_phi); 3360 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 3361 { 3362 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt); 3363 stmt_vec_info new_phi_vinfo; 3364 tree vect_phi_init, preheader_arg, vect_phi_res, init_def; 3365 basic_block bb = gimple_bb (use_stmt); 3366 gimple use; 3367 3368 /* Check that USE_STMT is really double reduction phi node. */ 3369 if (gimple_code (use_stmt) != GIMPLE_PHI 3370 || gimple_phi_num_args (use_stmt) != 2 3371 || !use_stmt_vinfo 3372 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) 3373 != vect_double_reduction_def 3374 || bb->loop_father != outer_loop) 3375 continue; 3376 3377 /* Create vector phi node for double reduction: 3378 vs1 = phi <vs0, vs2> 3379 vs1 was created previously in this function by a call to 3380 vect_get_vec_def_for_operand and is stored in vec_initial_def; 3381 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI; 3382 vs0 is created here. */ 3383 3384 /* Create vector phi node. */ 3385 vect_phi = create_phi_node (vec_initial_def, bb); 3386 new_phi_vinfo = new_stmt_vec_info (vect_phi, 3387 loop_vec_info_for_loop (outer_loop), NULL); 3388 set_vinfo_for_stmt (vect_phi, new_phi_vinfo); 3389 3390 /* Create vs0 - initial def of the double reduction phi. */ 3391 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, 3392 loop_preheader_edge (outer_loop)); 3393 init_def = get_initial_def_for_reduction (stmt, preheader_arg, 3394 NULL); 3395 vect_phi_init = vect_init_vector (use_stmt, init_def, vectype, 3396 NULL); 3397 3398 /* Update phi node arguments with vs0 and vs2. */ 3399 add_phi_arg (vect_phi, vect_phi_init, 3400 loop_preheader_edge (outer_loop), UNKNOWN_LOCATION); 3401 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), 3402 loop_latch_edge (outer_loop), UNKNOWN_LOCATION); 3403 if (vect_print_dump_info (REPORT_DETAILS)) 3404 { 3405 fprintf (vect_dump, "created double reduction phi node: "); 3406 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM); 3407 } 3408 3409 vect_phi_res = PHI_RESULT (vect_phi); 3410 3411 /* Replace the use, i.e., set the correct vs1 in the regular 3412 reduction phi node. FORNOW, NCOPIES is always 1, so the loop 3413 is redundant. 
*/
3414 	  use = reduction_phi;
3415 	  for (j = 0; j < ncopies; j++)
3416 	    {
3417 	      edge pr_edge = loop_preheader_edge (loop);
3418 	      SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3419 	      use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3420 	    }
3421 	}
3422     }
3423
3424       /* Replace the uses:  */
3425       orig_name = PHI_RESULT (exit_phi);
3426       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3427         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3428           SET_USE (use_p, new_temp);
3429     }
3430
3431   VEC_free (gimple, heap, phis);
3432 }
3433
3434
3435 /* Function vectorizable_reduction.
3436
3437    Check if STMT performs a reduction operation that can be vectorized.
3438    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3439    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3440    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3441
3442    This function also handles reduction idioms (patterns) that have been
3443    recognized in advance during vect_pattern_recog.  In this case, STMT may be
3444    of this form:
3445      X = pattern_expr (arg0, arg1, ..., X)
3446    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3447    sequence that had been detected and replaced by the pattern-stmt (STMT).
3448
3449    In some cases of reduction patterns, the type of the reduction variable X is
3450    different from the type of the other arguments of STMT.
3451    In such cases, the vectype that is used when transforming STMT into a vector
3452    stmt is different from the vectype that is used to determine the
3453    vectorization factor, because it consists of a different number of elements
3454    than the actual number of elements that are being operated upon in parallel.
3455
3456    For example, consider an accumulation of shorts into an int accumulator.
3457    On some targets it's possible to vectorize this pattern operating on 8
3458    shorts at a time (hence, the vectype for purposes of determining the
3459    vectorization factor should be V8HI); on the other hand, the vectype that
3460    is used to create the vector form is actually V4SI (the type of the result).
3461
3462    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3463    indicates the actual level of parallelism (V8HI in the example), so
3464    that the right vectorization factor would be derived.  This vectype
3465    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3466    be used to create the vectorized stmt.  The right vectype for the vectorized
3467    stmt is obtained from the type of the result X:
3468      get_vectype_for_scalar_type (TREE_TYPE (X))
3469
3470    This means that, contrary to "regular" reductions (or "regular" stmts in
3471    general), the following equation:
3472      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3473    does *NOT* necessarily hold for reduction patterns.
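   E.g. (cf. the dot-product pattern), for short arrays a and b and an
   int accumulator s, the pattern-stmt

     STMT: s_1 = DOT_PROD_EXPR <a[i], b[i], s_0>

   has STMT_VINFO_VECTYPE V8HI (8 shorts per iteration, so VF = 8),
   while the vectorized stmt itself is created with
   get_vectype_for_scalar_type (TREE_TYPE (s)), i.e. V4SI.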
*/

bool
vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                        gimple *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum tree_code code, orig_code, epilog_reduc_code;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt;
  gimple new_phi = NULL;
  tree scalar_type;
  bool is_simple_use;
  gimple orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  int epilog_copies;
  stmt_vec_info prev_stmt_info, prev_phi_info;
  gimple first_phi = NULL;
  bool single_defuse_cycle = false;
  tree reduc_def = NULL_TREE;
  gimple new_stmt = NULL;
  int j;
  tree ops[3];
  bool nested_cycle = false, found_nested_cycle_def = false;
  gimple reduc_def_stmt = NULL;
  /* The default is that the reduction variable is the last operand in
     the statement.  */
  int reduc_index = 2;
  bool double_reduc = false, dummy;
  basic_block def_bb;
  struct loop *def_stmt_loop, *outer_loop = NULL;
  tree def_arg;
  gimple def_arg_stmt;

  if (nested_in_vect_loop_p (loop, stmt))
    {
      outer_loop = loop;
      loop = loop->inner;
      nested_cycle = true;
    }

  gcc_assert (ncopies >= 1);

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  /* 1. Is vectorizable reduction?  */
  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
    return false;

  /* Reductions that are not used even in an enclosing outer-loop
     are expected to be "live" (used out of the loop).  */
  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }

  /* 3. Check the operands of the operation.  The first operands are defined
     inside the loop body.  The last operand is the reduction variable,
     which is defined by the loop-header-phi.
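
     For example, in the typical sum reduction

       sum_4 = x_2 + sum_3

     the def of x_2 comes from inside the loop body, while the last
     operand, sum_3, is the reduction variable defined by the loop-header
     phi.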
*/

  gcc_assert (is_gimple_assign (stmt));

  /* Flatten RHS.  */
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
      if (op_type == ternary_op)
        {
          tree rhs = gimple_assign_rhs1 (stmt);
          ops[0] = TREE_OPERAND (rhs, 0);
          ops[1] = TREE_OPERAND (rhs, 1);
          ops[2] = TREE_OPERAND (rhs, 2);
          code = TREE_CODE (rhs);
        }
      else
        return false;
      break;

    case GIMPLE_BINARY_RHS:
      code = gimple_assign_rhs_code (stmt);
      op_type = TREE_CODE_LENGTH (code);
      gcc_assert (op_type == binary_op);
      ops[0] = gimple_assign_rhs1 (stmt);
      ops[1] = gimple_assign_rhs2 (stmt);
      break;

    case GIMPLE_UNARY_RHS:
      return false;

    default:
      gcc_unreachable ();
    }

  scalar_dest = gimple_assign_lhs (stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
      && !SCALAR_FLOAT_TYPE_P (scalar_type))
    return false;

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  for (i = 0; i < op_type - 1; i++)
    {
      /* The condition of COND_EXPR is checked in
         vectorizable_condition().  */
      if (i == 0 && code == COND_EXPR)
        continue;

      is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
                                          &def, &dt);
      gcc_assert (is_simple_use);
      if (dt != vect_internal_def
          && dt != vect_external_def
          && dt != vect_constant_def
          && dt != vect_induction_def
          && !(dt == vect_nested_cycle && nested_cycle))
        return false;

      if (dt == vect_nested_cycle)
        {
          found_nested_cycle_def = true;
          reduc_def_stmt = def_stmt;
          reduc_index = i;
        }
    }

  is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
                                      &def, &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def
              || dt == vect_nested_cycle
              || ((dt == vect_internal_def || dt == vect_external_def
                   || dt == vect_constant_def || dt == vect_induction_def)
                  && nested_cycle && found_nested_cycle_def));
  if (!found_nested_cycle_def)
    reduc_def_stmt = def_stmt;

  gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
                                                       reduc_def_stmt,
                                                       !nested_cycle,
                                                       &dummy));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
                                                  !nested_cycle, &dummy));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
    return false;

  vec_mode = TYPE_MODE (vectype);

  if (code == COND_EXPR)
    {
      if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "unsupported condition in reduction");

          return false;
        }
    }
  else
    {
      /* 4. Supportable by target?  */

      /* 4.1.
         Check support for the operation in the loop.  */
      optab = optab_for_tree_code (code, vectype, optab_default);
      if (!optab)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "no optab.");

          return false;
        }

      if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "op not supported by target.");

          if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
              || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                 < vect_min_worthwhile_factor (code))
            return false;

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "proceeding using word mode.");
        }

      /* Worthwhile without SIMD support?  */
      if (!VECTOR_MODE_P (TYPE_MODE (vectype))
          && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "not worthwhile without SIMD support.");

          return false;
        }
    }

  /* 4.2. Check support for the epilog operation.

     If STMT represents a reduction pattern, then the type of the
     reduction variable may be different from the type of the rest
     of the arguments.  For example, consider the case of accumulation
     of shorts into an int accumulator.  The original code:
       S1: int_a = (int) short_a;
       orig_stmt-> S2: int_acc = plus <int_a, int_acc>;

     was replaced with:
       STMT: int_acc = widen_sum <short_a, int_acc>

     This means that:
     1. The tree-code that is used to create the vector operation in the
        epilog code (that reduces the partial results) is not the
        tree-code of STMT, but is rather the tree-code of the original
        stmt from the pattern that STMT is replacing.  I.e., in the example
        above we want to use 'widen_sum' in the loop, but 'plus' in the
        epilog.
     2. The type (mode) we use to check available target support
        for the vector operation to be created in the *epilog* is
        determined by the type of the reduction variable (in the example
        above we'd check this: plus_optab[vect_int_mode]).
        However, the type (mode) we use to check available target support
        for the vector operation to be created *inside the loop* is
        determined by the type of the other arguments to STMT (in the
        example we'd check this: widen_sum_optab[vect_short_mode]).

     This is contrary to "regular" reductions, in which the types of all
     the arguments are the same as the type of the reduction variable.
     For "regular" reductions we can therefore use the same vector type
     (and also the same tree-code) when generating the epilog code and
     when generating the code inside the loop.  */

  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = gimple_assign_rhs_code (orig_stmt);
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      if (!vectype)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "unsupported data-type ");
              print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
            }
          return false;
        }

      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: the same vectype and tree-code that are used
         for the vector code inside the loop can also be used for the
         epilog code.
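         For instance (our illustration): a plain 'int' summation can use
         PLUS_EXPR on the same V4SI-like vectype both for the vector add
         inside the loop and for the epilog code that reduces the partial
         sums.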
  */
      orig_code = code;
    }

  if (nested_cycle)
    {
      def_bb = gimple_bb (reduc_def_stmt);
      def_stmt_loop = def_bb->loop_father;
      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
                                       loop_preheader_edge (def_stmt_loop));
      if (TREE_CODE (def_arg) == SSA_NAME
          && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
          && gimple_code (def_arg_stmt) == GIMPLE_PHI
          && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
          && vinfo_for_stmt (def_arg_stmt)
          && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
               == vect_double_reduction_def)
        double_reduc = true;
    }

  epilog_reduc_code = ERROR_MARK;
  if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    {
      reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
                                         optab_default);
      if (!reduc_optab)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "no optab for reduction.");

          epilog_reduc_code = ERROR_MARK;
        }

      if (reduc_optab
          && optab_handler (reduc_optab, vec_mode)->insn_code
             == CODE_FOR_nothing)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduc op not supported by target.");

          epilog_reduc_code = ERROR_MARK;
        }
    }
  else
    {
      if (!nested_cycle || double_reduc)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "no reduc code for scalar code.");

          return false;
        }
    }

  if (double_reduc && ncopies > 1)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "multiple types in double reduction");

      return false;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
        return false;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  /* Create the destination vector.  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt, i.e., we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
        r0 = phi (init, r0)
        r1 = phi (init, r1)
        r0 = x0 + r0;
        r1 = x1 + r1;
     (i.e. we generate VF results in 2 registers).
     In this case we have a separate def-use cycle for each copy, and
     therefore for each copy we get the vector def for the reduction
     variable from the respective phi node created for this copy.

     Otherwise (the reduction is unused in the loop nest), we can combine
     together intermediate results, like so (e.g. for ncopies=2):
        r = phi (init, r)
        r = x0 + r;
        r = x1 + r;
     (i.e. we generate VF/2 results in a single register).
     In this case for each copy we get the vector def for the reduction
     variable from the vectorized reduction operation generated in the
     previous iteration.  */

  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
    {
      single_defuse_cycle = true;
      epilog_copies = 1;
    }
  else
    epilog_copies = ncopies;

  prev_stmt_info = NULL;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      if (j == 0 || !single_defuse_cycle)
        {
          /* Create the reduction-phi that defines the reduction-operand.  */
          new_phi = create_phi_node (vec_dest, loop->header);
          set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
                                                          NULL));
          /* Get the vector def for the reduction variable from the phi
             node.  */
          reduc_def = PHI_RESULT (new_phi);
        }

      if (code == COND_EXPR)
        {
          first_phi = new_phi;
          vectorizable_condition (stmt, gsi, vec_stmt, reduc_def,
                                  reduc_index);
          /* Multiple types are not supported for condition.  */
          break;
        }

      /* Handle uses.  */
      if (j == 0)
        {
          loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
                                                        stmt, NULL);
          if (op_type == ternary_op)
            {
              if (reduc_index == 0)
                loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
                                                              NULL);
              else
                loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
                                                              NULL);
            }

          /* Get the vector def for the reduction variable from the phi
             node.  */
          first_phi = new_phi;
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type; /* Dummy.  */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
                                                            loop_vec_def1);

          if (single_defuse_cycle)
            reduc_def = gimple_assign_lhs (new_stmt);
          else
            reduc_def = PHI_RESULT (new_phi);

          STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
        }

      /* Arguments are ready.  Create the new vector stmt.  */
      if (op_type == binary_op)
        {
          if (reduc_index == 0)
            expr = build2 (code, vectype, reduc_def, loop_vec_def0);
          else
            expr = build2 (code, vectype, loop_vec_def0, reduc_def);
        }
      else
        {
          if (reduc_index == 0)
            expr = build3 (code, vectype, reduc_def, loop_vec_def0,
                           loop_vec_def1);
          else
            {
              if (reduc_index == 1)
                expr = build3 (code, vectype, loop_vec_def0, reduc_def,
                               loop_vec_def1);
              else
                expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                               reduc_def);
            }
        }

      new_stmt = gimple_build_assign (vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      vect_finish_stmt_generation (stmt, new_stmt, gsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

      prev_stmt_info = vinfo_for_stmt (new_stmt);
      prev_phi_info = vinfo_for_stmt (new_phi);
    }

  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  if (!single_defuse_cycle || code == COND_EXPR)
    new_temp = gimple_assign_lhs (*vec_stmt);

  vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
                                    epilog_reduc_code, first_phi, reduc_index,
                                    double_reduc);
  return true;
}

/* Function vect_min_worthwhile_factor.

   For a loop where we could vectorize the operation indicated by CODE,
   return the minimum vectorization factor that makes it worthwhile
   to use generic vectors.  */

int
vect_min_worthwhile_factor (enum tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
      return 4;

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return 2;

    default:
      return INT_MAX;
    }
}


/* Function vectorizable_induction.

   Check if PHI performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a
   vectorized phi to replace it, put it in VEC_STMT, and add it to the same
   basic block.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */

bool
vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                        gimple *vec_stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  tree vec_def;

  gcc_assert (ncopies >= 1);
  /* FORNOW.  This restriction should be relaxed.  */
  if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "multiple types in nested loop.");
      return false;
    }

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);

  if (gimple_code (phi) != GIMPLE_PHI)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "=== vectorizable_induction ===");
      vect_model_induction_cost (stmt_info, ncopies);
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform induction phi.");

  vec_def = get_initial_def_for_induction (phi);
  *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
  return true;
}

/* Function vectorizable_live_operation.

   STMT computes a value that is used outside the loop.  Check if
   it can be supported.  */

bool
vectorizable_live_operation (gimple stmt,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                             gimple *vec_stmt ATTRIBUTE_UNUSED)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int op_type;
  tree op;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt;
  enum tree_code code;
  enum gimple_rhs_class rhs_class;

  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
    return false;

  if (!is_gimple_assign (stmt))
    return false;

  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
    return false;

  /* FORNOW.  CHECKME.  */
  if (nested_in_vect_loop_p (loop, stmt))
    return false;

  code = gimple_assign_rhs_code (stmt);
  op_type = TREE_CODE_LENGTH (code);
  rhs_class = get_gimple_rhs_class (code);
  gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
  gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);

  /* FORNOW: support only if all uses are invariant.  This means
     that the scalar operations can remain in place, unvectorized.
     The original last scalar value that they compute will be used.  */

  for (i = 0; i < op_type; i++)
    {
      if (rhs_class == GIMPLE_SINGLE_RHS)
        op = TREE_OPERAND (gimple_op (stmt, 1), i);
      else
        op = gimple_op (stmt, i + 1);
      if (op
          && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "use not simple.");
          return false;
        }

      if (dt != vect_external_def && dt != vect_constant_def)
        return false;
    }

  /* No transformation is required for the cases we currently support.  */
  return true;
}

/* Kill any debug uses outside LOOP of SSA names defined in STMT.  */

static void
vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple ustmt;

  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
        {
          basic_block bb;

          if (!is_gimple_debug (ustmt))
            continue;

          bb = gimple_bb (ustmt);

          if (!flow_bb_inside_loop_p (loop, bb))
            {
              if (gimple_debug_bind_p (ustmt))
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "killing debug use");

                  gimple_debug_bind_reset_value (ustmt);
                  update_stmt (ustmt);
                }
              else
                gcc_unreachable ();
            }
        }
    }
}

/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop: create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  bool do_peeling_for_loop_bound;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown store is allowed.
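
     For illustration (a sketch of ours, not from the sources): if a[]
     below is the only access whose misalignment is unknown at compile
     time,

        for (i = 0; i < n; i++)
          a[i] = b[i] + c[i];

     a scalar prologue loop can execute the first few iterations until the
     access to a[] becomes aligned, and the vector loop then proceeds from
     an aligned address.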
  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  do_peeling_for_loop_bound
    = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
           && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));

  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    vect_loop_versioning (loop_vinfo,
                          !do_peeling_for_loop_bound,
                          &cond_expr, &cond_expr_stmt_list);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that isn't divisible by
     the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations (VF is the vectorization factor).  For instance, with
     n = 100 and VF = 8 the vector loop covers the first 96 elements and
     the scalar epilog the remaining 4.  */

  if (do_peeling_for_loop_bound)
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
                                    cond_expr, cond_expr_stmt_list);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed will need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;

          if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, phi);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* Vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, stmt);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits = (unsigned int)
                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP VF is set according to the unrolling factor, and not
               to the vector size, hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  vect_schedule_slp (loop_vinfo, NULL);
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL,
                                          NULL);
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}