1 /* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2015 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "dumpfile.h" 26 #include "tm.h" 27 #include "hash-set.h" 28 #include "machmode.h" 29 #include "vec.h" 30 #include "double-int.h" 31 #include "input.h" 32 #include "alias.h" 33 #include "symtab.h" 34 #include "wide-int.h" 35 #include "inchash.h" 36 #include "tree.h" 37 #include "fold-const.h" 38 #include "stor-layout.h" 39 #include "tm_p.h" 40 #include "target.h" 41 #include "predict.h" 42 #include "hard-reg-set.h" 43 #include "function.h" 44 #include "dominance.h" 45 #include "cfg.h" 46 #include "basic-block.h" 47 #include "gimple-pretty-print.h" 48 #include "tree-ssa-alias.h" 49 #include "internal-fn.h" 50 #include "tree-eh.h" 51 #include "gimple-expr.h" 52 #include "is-a.h" 53 #include "gimple.h" 54 #include "gimplify.h" 55 #include "gimple-iterator.h" 56 #include "gimplify-me.h" 57 #include "gimple-ssa.h" 58 #include "tree-phinodes.h" 59 #include "ssa-iterators.h" 60 #include "stringpool.h" 61 #include "tree-ssanames.h" 62 #include "tree-ssa-loop-ivopts.h" 63 #include "tree-ssa-loop-manip.h" 64 #include "tree-ssa-loop.h" 65 #include "cfgloop.h" 66 #include "tree-chrec.h" 67 #include "tree-scalar-evolution.h" 68 #include "tree-vectorizer.h" 69 #include "diagnostic-core.h" 70 #include "hash-map.h" 71 #include "plugin-api.h" 72 #include "ipa-ref.h" 73 #include "cgraph.h" 74 /* Need to include rtl.h, expr.h, etc. for optabs. */ 75 #include "hashtab.h" 76 #include "rtl.h" 77 #include "flags.h" 78 #include "statistics.h" 79 #include "real.h" 80 #include "fixed-value.h" 81 #include "insn-config.h" 82 #include "expmed.h" 83 #include "dojump.h" 84 #include "explow.h" 85 #include "calls.h" 86 #include "emit-rtl.h" 87 #include "varasm.h" 88 #include "stmt.h" 89 #include "expr.h" 90 #include "insn-codes.h" 91 #include "optabs.h" 92 #include "builtins.h" 93 94 /* Return true if load- or store-lanes optab OPTAB is implemented for 95 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. 
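   For example (an illustrative reading, not part of the original comment):
   with a 128-bit VECTYPE such as V4SI and COUNT == 3, the question is
   whether the target provides a 3 * 128 = 384-bit integer (array) mode
   together with a load/store-lanes pattern converting between that mode
   and V4SI; if either piece is missing, the optab cannot be used and the
   function answers false.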
*/ 96 97 static bool 98 vect_lanes_optab_supported_p (const char *name, convert_optab optab, 99 tree vectype, unsigned HOST_WIDE_INT count) 100 { 101 machine_mode mode, array_mode; 102 bool limit_p; 103 104 mode = TYPE_MODE (vectype); 105 limit_p = !targetm.array_mode_supported_p (mode, count); 106 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode), 107 MODE_INT, limit_p); 108 109 if (array_mode == BLKmode) 110 { 111 if (dump_enabled_p ()) 112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 113 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n", 114 GET_MODE_NAME (mode), count); 115 return false; 116 } 117 118 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 119 { 120 if (dump_enabled_p ()) 121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 122 "cannot use %s<%s><%s>\n", name, 123 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 124 return false; 125 } 126 127 if (dump_enabled_p ()) 128 dump_printf_loc (MSG_NOTE, vect_location, 129 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), 130 GET_MODE_NAME (mode)); 131 132 return true; 133 } 134 135 136 /* Return the smallest scalar part of STMT. 137 This is used to determine the vectype of the stmt. We generally set the 138 vectype according to the type of the result (lhs). For stmts whose 139 result-type is different than the type of the arguments (e.g., demotion, 140 promotion), vectype will be reset appropriately (later). Note that we have 141 to visit the smallest datatype in this function, because that determines the 142 VF. If the smallest datatype in the loop is present only as the rhs of a 143 promotion operation - we'd miss it. 144 Such a case, where a variable of this datatype does not appear in the lhs 145 anywhere in the loop, can only occur if it's an invariant: e.g.: 146 'int_x = (int) short_inv', which we'd expect to have been optimized away by 147 invariant motion. However, we cannot rely on invariant motion to always 148 take invariants out of the loop, and so in the case of promotion we also 149 have to check the rhs. 150 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 151 types. */ 152 153 tree 154 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit, 155 HOST_WIDE_INT *rhs_size_unit) 156 { 157 tree scalar_type = gimple_expr_type (stmt); 158 HOST_WIDE_INT lhs, rhs; 159 160 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 161 162 if (is_gimple_assign (stmt) 163 && (gimple_assign_cast_p (stmt) 164 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR 165 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR 166 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR)) 167 { 168 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 169 170 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 171 if (rhs < lhs) 172 scalar_type = rhs_type; 173 } 174 175 *lhs_size_unit = lhs; 176 *rhs_size_unit = rhs; 177 return scalar_type; 178 } 179 180 181 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 182 tested at run-time. Return TRUE if DDR was successfully inserted. 183 Return false if versioning is not supported. 
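   An illustrative case (a minimal sketch, not drawn from the testsuite)
   of a loop that ends up here:

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   Nothing proves that A and B do not overlap, so the DDR between a[i] and
   b[i] is recorded here and the vectorizer later guards the vectorized
   loop with an address-overlap check, falling back to the scalar loop
   when the check fails.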
*/ 184 185 static bool 186 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) 187 { 188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 189 190 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) 191 return false; 192 193 if (dump_enabled_p ()) 194 { 195 dump_printf_loc (MSG_NOTE, vect_location, 196 "mark for run-time aliasing test between "); 197 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr))); 198 dump_printf (MSG_NOTE, " and "); 199 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr))); 200 dump_printf (MSG_NOTE, "\n"); 201 } 202 203 if (optimize_loop_nest_for_size_p (loop)) 204 { 205 if (dump_enabled_p ()) 206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 207 "versioning not supported when optimizing" 208 " for size.\n"); 209 return false; 210 } 211 212 /* FORNOW: We don't support versioning with outer-loop vectorization. */ 213 if (loop->inner) 214 { 215 if (dump_enabled_p ()) 216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 217 "versioning not yet supported for outer-loops.\n"); 218 return false; 219 } 220 221 /* FORNOW: We don't support creating runtime alias tests for non-constant 222 step. */ 223 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST 224 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST) 225 { 226 if (dump_enabled_p ()) 227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 228 "versioning not yet supported for non-constant " 229 "step\n"); 230 return false; 231 } 232 233 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); 234 return true; 235 } 236 237 238 /* Function vect_analyze_data_ref_dependence. 239 240 Return TRUE if there (might) exist a dependence between a memory-reference 241 DRA and a memory-reference DRB. When versioning for alias may check a 242 dependence at run-time, return FALSE. Adjust *MAX_VF according to 243 the data dependence. */ 244 245 static bool 246 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 247 loop_vec_info loop_vinfo, int *max_vf) 248 { 249 unsigned int i; 250 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 251 struct data_reference *dra = DDR_A (ddr); 252 struct data_reference *drb = DDR_B (ddr); 253 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 254 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 255 lambda_vector dist_v; 256 unsigned int loop_depth; 257 258 /* In loop analysis all data references should be vectorizable. */ 259 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 260 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 261 gcc_unreachable (); 262 263 /* Independent data accesses. */ 264 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 265 return false; 266 267 if (dra == drb 268 || (DR_IS_READ (dra) && DR_IS_READ (drb))) 269 return false; 270 271 /* Even if we have an anti-dependence then, as the vectorized loop covers at 272 least two scalar iterations, there is always also a true dependence. 273 As the vectorizer does not re-order loads and stores we can ignore 274 the anti-dependence if TBAA can disambiguate both DRs similar to the 275 case with known negative distance anti-dependences (positive 276 distance anti-dependences would violate TBAA constraints). */ 277 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) 278 || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) 279 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), 280 get_alias_set (DR_REF (drb)))) 281 return false; 282 283 /* Unknown data dependence. 
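     This is the situation for, e.g., two pointer-based accesses whose base
     addresses cannot be disambiguated.  Note that a user-supplied
     '#pragma omp simd safelen(N)' is reflected in loop->safelen, which the
     code below uses to assume independence while capping *max_vf at N.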
*/ 284 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 285 { 286 /* If user asserted safelen consecutive iterations can be 287 executed concurrently, assume independence. */ 288 if (loop->safelen >= 2) 289 { 290 if (loop->safelen < *max_vf) 291 *max_vf = loop->safelen; 292 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 293 return false; 294 } 295 296 if (STMT_VINFO_GATHER_P (stmtinfo_a) 297 || STMT_VINFO_GATHER_P (stmtinfo_b)) 298 { 299 if (dump_enabled_p ()) 300 { 301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 302 "versioning for alias not supported for: " 303 "can't determine dependence between "); 304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 305 DR_REF (dra)); 306 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 308 DR_REF (drb)); 309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 310 } 311 return true; 312 } 313 314 if (dump_enabled_p ()) 315 { 316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 317 "versioning for alias required: " 318 "can't determine dependence between "); 319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 320 DR_REF (dra)); 321 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 323 DR_REF (drb)); 324 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 325 } 326 327 /* Add to list of ddrs that need to be tested at run-time. */ 328 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 329 } 330 331 /* Known data dependence. */ 332 if (DDR_NUM_DIST_VECTS (ddr) == 0) 333 { 334 /* If user asserted safelen consecutive iterations can be 335 executed concurrently, assume independence. */ 336 if (loop->safelen >= 2) 337 { 338 if (loop->safelen < *max_vf) 339 *max_vf = loop->safelen; 340 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 341 return false; 342 } 343 344 if (STMT_VINFO_GATHER_P (stmtinfo_a) 345 || STMT_VINFO_GATHER_P (stmtinfo_b)) 346 { 347 if (dump_enabled_p ()) 348 { 349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 350 "versioning for alias not supported for: " 351 "bad dist vector for "); 352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 353 DR_REF (dra)); 354 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 355 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 356 DR_REF (drb)); 357 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 358 } 359 return true; 360 } 361 362 if (dump_enabled_p ()) 363 { 364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 365 "versioning for alias required: " 366 "bad dist vector for "); 367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 368 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 369 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 370 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 371 } 372 /* Add to list of ddrs that need to be tested at run-time. 
*/ 373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 374 } 375 376 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 377 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 378 { 379 int dist = dist_v[loop_depth]; 380 381 if (dump_enabled_p ()) 382 dump_printf_loc (MSG_NOTE, vect_location, 383 "dependence distance = %d.\n", dist); 384 385 if (dist == 0) 386 { 387 if (dump_enabled_p ()) 388 { 389 dump_printf_loc (MSG_NOTE, vect_location, 390 "dependence distance == 0 between "); 391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 392 dump_printf (MSG_NOTE, " and "); 393 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 394 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 395 } 396 397 /* When we perform grouped accesses and perform implicit CSE 398 by detecting equal accesses and doing disambiguation with 399 runtime alias tests like for 400 .. = a[i]; 401 .. = a[i+1]; 402 a[i] = ..; 403 a[i+1] = ..; 404 *p = ..; 405 .. = a[i]; 406 .. = a[i+1]; 407 where we will end up loading { a[i], a[i+1] } once, make 408 sure that inserting group loads before the first load and 409 stores after the last store will do the right thing. 410 Similar for groups like 411 a[i] = ...; 412 ... = a[i]; 413 a[i+1] = ...; 414 where loads from the group interleave with the store. */ 415 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) 416 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) 417 { 418 gimple earlier_stmt; 419 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb)); 420 if (DR_IS_WRITE 421 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)))) 422 { 423 if (dump_enabled_p ()) 424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 425 "READ_WRITE dependence in interleaving." 426 "\n"); 427 return true; 428 } 429 } 430 431 continue; 432 } 433 434 if (dist > 0 && DDR_REVERSED_P (ddr)) 435 { 436 /* If DDR_REVERSED_P the order of the data-refs in DDR was 437 reversed (to make distance vector positive), and the actual 438 distance is negative. */ 439 if (dump_enabled_p ()) 440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 441 "dependence distance negative.\n"); 442 /* Record a negative dependence distance to later limit the 443 amount of stmt copying / unrolling we can perform. 444 Only need to handle read-after-write dependence. */ 445 if (DR_IS_READ (drb) 446 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 447 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 448 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 449 continue; 450 } 451 452 if (abs (dist) >= 2 453 && abs (dist) < *max_vf) 454 { 455 /* The dependence distance requires reduction of the maximal 456 vectorization factor. */ 457 *max_vf = abs (dist); 458 if (dump_enabled_p ()) 459 dump_printf_loc (MSG_NOTE, vect_location, 460 "adjusting maximal vectorization factor to %i\n", 461 *max_vf); 462 } 463 464 if (abs (dist) >= *max_vf) 465 { 466 /* Dependence distance does not create dependence, as far as 467 vectorization is concerned, in this case. 
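             For example, in the (illustrative) loop

               for (i = 0; i < n; i++)
                 a[i + 4] = a[i] + 1;

             the distance is 4 and *max_vf is at most 4 at this point (the
             block above lowers it if needed), so every vector iteration of
             at most 4 lanes reads a[i] before any iteration that
             overwrites it, and the distance needs no further handling.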
*/ 468 if (dump_enabled_p ()) 469 dump_printf_loc (MSG_NOTE, vect_location, 470 "dependence distance >= VF.\n"); 471 continue; 472 } 473 474 if (dump_enabled_p ()) 475 { 476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 477 "not vectorized, possible dependence " 478 "between data-refs "); 479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 480 dump_printf (MSG_NOTE, " and "); 481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 482 dump_printf (MSG_NOTE, "\n"); 483 } 484 485 return true; 486 } 487 488 return false; 489 } 490 491 /* Function vect_analyze_data_ref_dependences. 492 493 Examine all the data references in the loop, and make sure there do not 494 exist any data dependences between them. Set *MAX_VF according to 495 the maximum vectorization factor the data dependences allow. */ 496 497 bool 498 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf) 499 { 500 unsigned int i; 501 struct data_dependence_relation *ddr; 502 503 if (dump_enabled_p ()) 504 dump_printf_loc (MSG_NOTE, vect_location, 505 "=== vect_analyze_data_ref_dependences ===\n"); 506 507 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; 508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), 509 &LOOP_VINFO_DDRS (loop_vinfo), 510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true)) 511 return false; 512 513 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) 514 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf)) 515 return false; 516 517 return true; 518 } 519 520 521 /* Function vect_slp_analyze_data_ref_dependence. 522 523 Return TRUE if there (might) exist a dependence between a memory-reference 524 DRA and a memory-reference DRB. When versioning for alias may check a 525 dependence at run-time, return FALSE. Adjust *MAX_VF according to 526 the data dependence. */ 527 528 static bool 529 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr) 530 { 531 struct data_reference *dra = DDR_A (ddr); 532 struct data_reference *drb = DDR_B (ddr); 533 534 /* We need to check dependences of statements marked as unvectorizable 535 as well, they still can prohibit vectorization. */ 536 537 /* Independent data accesses. */ 538 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 539 return false; 540 541 if (dra == drb) 542 return false; 543 544 /* Read-read is OK. */ 545 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 546 return false; 547 548 /* If dra and drb are part of the same interleaving chain consider 549 them independent. */ 550 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra))) 551 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra))) 552 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb))))) 553 return false; 554 555 /* Unknown data dependence. 
  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  /* We do not vectorize basic blocks with write-write dependencies.  */
  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
    return true;

  /* If we have a read-write dependence check that the load is before the store.
     When we vectorize basic blocks, vector load can be only before
     corresponding scalar load, and vector store can be only after its
     corresponding scalar store.  So the order of the accesses is preserved in
     case the load is before the store.  */
  gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
    {
      /* That only holds for load-store pairs taking part in vectorization.  */
      if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
          && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
        return false;
    }

  return true;
}


/* Function vect_slp_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
{
  struct data_dependence_relation *ddr;
  unsigned int i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_data_ref_dependences ===\n");

  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
                                &BB_VINFO_DDRS (bb_vinfo),
                                vNULL, true))
    return false;

  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
    if (vect_slp_analyze_data_ref_dependence (ddr))
      return false;

  return true;
}


/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.
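   As a worked illustration: with 4-byte ints, a 16-byte vector type, and a
   reference a[i + 3] into an array whose start is 16-byte aligned, DR_INIT
   is 12 bytes and the recorded misalignment is 12 mod 16 = 12; for
   a[i + 4] it would be 16 mod 16 = 0, i.e. an aligned access.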
*/ 640 641 static bool 642 vect_compute_data_ref_alignment (struct data_reference *dr) 643 { 644 gimple stmt = DR_STMT (dr); 645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 647 struct loop *loop = NULL; 648 tree ref = DR_REF (dr); 649 tree vectype; 650 tree base, base_addr; 651 tree misalign; 652 tree aligned_to; 653 unsigned HOST_WIDE_INT alignment; 654 655 if (dump_enabled_p ()) 656 dump_printf_loc (MSG_NOTE, vect_location, 657 "vect_compute_data_ref_alignment:\n"); 658 659 if (loop_vinfo) 660 loop = LOOP_VINFO_LOOP (loop_vinfo); 661 662 /* Initialize misalignment to unknown. */ 663 SET_DR_MISALIGNMENT (dr, -1); 664 665 /* Strided loads perform only component accesses, misalignment information 666 is irrelevant for them. */ 667 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 668 return true; 669 670 misalign = DR_INIT (dr); 671 aligned_to = DR_ALIGNED_TO (dr); 672 base_addr = DR_BASE_ADDRESS (dr); 673 vectype = STMT_VINFO_VECTYPE (stmt_info); 674 675 /* In case the dataref is in an inner-loop of the loop that is being 676 vectorized (LOOP), we use the base and misalignment information 677 relative to the outer-loop (LOOP). This is ok only if the misalignment 678 stays the same throughout the execution of the inner-loop, which is why 679 we have to check that the stride of the dataref in the inner-loop evenly 680 divides by the vector size. */ 681 if (loop && nested_in_vect_loop_p (loop, stmt)) 682 { 683 tree step = DR_STEP (dr); 684 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 685 686 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0) 687 { 688 if (dump_enabled_p ()) 689 dump_printf_loc (MSG_NOTE, vect_location, 690 "inner step divides the vector-size.\n"); 691 misalign = STMT_VINFO_DR_INIT (stmt_info); 692 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info); 693 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info); 694 } 695 else 696 { 697 if (dump_enabled_p ()) 698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 699 "inner step doesn't divide the vector-size.\n"); 700 misalign = NULL_TREE; 701 } 702 } 703 704 /* Similarly, if we're doing basic-block vectorization, we can only use 705 base and misalignment information relative to an innermost loop if the 706 misalignment stays the same throughout the execution of the loop. 707 As above, this is the case if the stride of the dataref evenly divides 708 by the vector size. */ 709 if (!loop) 710 { 711 tree step = DR_STEP (dr); 712 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 713 714 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0) 715 { 716 if (dump_enabled_p ()) 717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 718 "SLP: step doesn't divide the vector-size.\n"); 719 misalign = NULL_TREE; 720 } 721 } 722 723 /* To look at alignment of the base we have to preserve an inner MEM_REF 724 as that carries alignment information of the actual access. 
*/ 725 base = ref; 726 while (handled_component_p (base)) 727 base = TREE_OPERAND (base, 0); 728 if (TREE_CODE (base) == MEM_REF) 729 base = build2 (MEM_REF, TREE_TYPE (base), base_addr, 730 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0)); 731 unsigned int base_alignment = get_object_alignment (base); 732 733 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype))) 734 DR_VECT_AUX (dr)->base_element_aligned = true; 735 736 alignment = TYPE_ALIGN_UNIT (vectype); 737 738 if ((compare_tree_int (aligned_to, alignment) < 0) 739 || !misalign) 740 { 741 if (dump_enabled_p ()) 742 { 743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 744 "Unknown alignment for access: "); 745 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 746 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 747 } 748 return true; 749 } 750 751 if (base_alignment < TYPE_ALIGN (vectype)) 752 { 753 /* Strip an inner MEM_REF to a bare decl if possible. */ 754 if (TREE_CODE (base) == MEM_REF 755 && integer_zerop (TREE_OPERAND (base, 1)) 756 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR) 757 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0); 758 759 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))) 760 { 761 if (dump_enabled_p ()) 762 { 763 dump_printf_loc (MSG_NOTE, vect_location, 764 "can't force alignment of ref: "); 765 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 766 dump_printf (MSG_NOTE, "\n"); 767 } 768 return true; 769 } 770 771 /* Force the alignment of the decl. 772 NOTE: This is the only change to the code we make during 773 the analysis phase, before deciding to vectorize the loop. */ 774 if (dump_enabled_p ()) 775 { 776 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of "); 777 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 778 dump_printf (MSG_NOTE, "\n"); 779 } 780 781 DR_VECT_AUX (dr)->base_decl = base; 782 DR_VECT_AUX (dr)->base_misaligned = true; 783 DR_VECT_AUX (dr)->base_element_aligned = true; 784 } 785 786 /* If this is a backward running DR then first access in the larger 787 vectype actually is N-1 elements before the address in the DR. 788 Adjust misalign accordingly. */ 789 if (tree_int_cst_sgn (DR_STEP (dr)) < 0) 790 { 791 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); 792 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type, 793 otherwise we wouldn't be here. */ 794 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr)); 795 /* PLUS because DR_STEP was negative. */ 796 misalign = size_binop (PLUS_EXPR, misalign, offset); 797 } 798 799 SET_DR_MISALIGNMENT (dr, 800 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ()); 801 802 if (dump_enabled_p ()) 803 { 804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 805 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr)); 806 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 807 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 808 } 809 810 return true; 811 } 812 813 814 /* Function vect_compute_data_refs_alignment 815 816 Compute the misalignment of data references in the loop. 817 Return FALSE if a data reference is found that cannot be vectorized. 
*/ 818 819 static bool 820 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo, 821 bb_vec_info bb_vinfo) 822 { 823 vec<data_reference_p> datarefs; 824 struct data_reference *dr; 825 unsigned int i; 826 827 if (loop_vinfo) 828 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 829 else 830 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 831 832 FOR_EACH_VEC_ELT (datarefs, i, dr) 833 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 834 && !vect_compute_data_ref_alignment (dr)) 835 { 836 if (bb_vinfo) 837 { 838 /* Mark unsupported statement as unvectorizable. */ 839 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 840 continue; 841 } 842 else 843 return false; 844 } 845 846 return true; 847 } 848 849 850 /* Function vect_update_misalignment_for_peel 851 852 DR - the data reference whose misalignment is to be adjusted. 853 DR_PEEL - the data reference whose misalignment is being made 854 zero in the vector loop by the peel. 855 NPEEL - the number of iterations in the peel loop if the misalignment 856 of DR_PEEL is known at compile time. */ 857 858 static void 859 vect_update_misalignment_for_peel (struct data_reference *dr, 860 struct data_reference *dr_peel, int npeel) 861 { 862 unsigned int i; 863 vec<dr_p> same_align_drs; 864 struct data_reference *current_dr; 865 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); 866 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel)))); 867 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 868 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel)); 869 870 /* For interleaved data accesses the step in the loop must be multiplied by 871 the size of the interleaving group. */ 872 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 873 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 874 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info)) 875 dr_peel_size *= GROUP_SIZE (peel_stmt_info); 876 877 /* It can be assumed that the data refs with the same alignment as dr_peel 878 are aligned in the vector loop. */ 879 same_align_drs 880 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel))); 881 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr) 882 { 883 if (current_dr != dr) 884 continue; 885 gcc_assert (DR_MISALIGNMENT (dr) / dr_size == 886 DR_MISALIGNMENT (dr_peel) / dr_peel_size); 887 SET_DR_MISALIGNMENT (dr, 0); 888 return; 889 } 890 891 if (known_alignment_for_access_p (dr) 892 && known_alignment_for_access_p (dr_peel)) 893 { 894 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; 895 int misal = DR_MISALIGNMENT (dr); 896 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 897 misal += negative ? -npeel * dr_size : npeel * dr_size; 898 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1; 899 SET_DR_MISALIGNMENT (dr, misal); 900 return; 901 } 902 903 if (dump_enabled_p ()) 904 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n"); 905 SET_DR_MISALIGNMENT (dr, -1); 906 } 907 908 909 /* Function vect_verify_datarefs_alignment 910 911 Return TRUE if all data references in the loop can be 912 handled with respect to alignment. 
*/ 913 914 bool 915 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) 916 { 917 vec<data_reference_p> datarefs; 918 struct data_reference *dr; 919 enum dr_alignment_support supportable_dr_alignment; 920 unsigned int i; 921 922 if (loop_vinfo) 923 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 924 else 925 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 926 927 FOR_EACH_VEC_ELT (datarefs, i, dr) 928 { 929 gimple stmt = DR_STMT (dr); 930 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 931 932 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 933 continue; 934 935 /* For interleaving, only the alignment of the first access matters. 936 Skip statements marked as not vectorizable. */ 937 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info) 938 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 939 || !STMT_VINFO_VECTORIZABLE (stmt_info)) 940 continue; 941 942 /* Strided loads perform only component accesses, alignment is 943 irrelevant for them. */ 944 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 945 continue; 946 947 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 948 if (!supportable_dr_alignment) 949 { 950 if (dump_enabled_p ()) 951 { 952 if (DR_IS_READ (dr)) 953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 954 "not vectorized: unsupported unaligned load."); 955 else 956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 957 "not vectorized: unsupported unaligned " 958 "store."); 959 960 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 961 DR_REF (dr)); 962 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 963 } 964 return false; 965 } 966 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 967 dump_printf_loc (MSG_NOTE, vect_location, 968 "Vectorizing an unaligned access.\n"); 969 } 970 return true; 971 } 972 973 /* Given an memory reference EXP return whether its alignment is less 974 than its size. */ 975 976 static bool 977 not_size_aligned (tree exp) 978 { 979 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) 980 return true; 981 982 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) 983 > get_object_alignment (exp)); 984 } 985 986 /* Function vector_alignment_reachable_p 987 988 Return true if vector alignment for DR is reachable by peeling 989 a few loop iterations. Return false otherwise. */ 990 991 static bool 992 vector_alignment_reachable_p (struct data_reference *dr) 993 { 994 gimple stmt = DR_STMT (dr); 995 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 996 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 997 998 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 999 { 1000 /* For interleaved access we peel only if number of iterations in 1001 the prolog loop ({VF - misalignment}), is a multiple of the 1002 number of the interleaved accesses. */ 1003 int elem_size, mis_in_elements; 1004 int nelements = TYPE_VECTOR_SUBPARTS (vectype); 1005 1006 /* FORNOW: handle only known alignment. */ 1007 if (!known_alignment_for_access_p (dr)) 1008 return false; 1009 1010 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements; 1011 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size; 1012 1013 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info)) 1014 return false; 1015 } 1016 1017 /* If misalignment is known at the compile time then allow peeling 1018 only if natural alignment is reachable through peeling. 
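     For example, with 4-byte elements and a 16-byte vector a known
     misalignment of 8 bytes disappears after peeling two scalar
     iterations, whereas a misalignment of 6 bytes is not a multiple of
     the element size and no amount of peeling can remove it.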
*/ 1019 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr)) 1020 { 1021 HOST_WIDE_INT elmsize = 1022 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1023 if (dump_enabled_p ()) 1024 { 1025 dump_printf_loc (MSG_NOTE, vect_location, 1026 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize); 1027 dump_printf (MSG_NOTE, 1028 ". misalignment = %d.\n", DR_MISALIGNMENT (dr)); 1029 } 1030 if (DR_MISALIGNMENT (dr) % elmsize) 1031 { 1032 if (dump_enabled_p ()) 1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1034 "data size does not divide the misalignment.\n"); 1035 return false; 1036 } 1037 } 1038 1039 if (!known_alignment_for_access_p (dr)) 1040 { 1041 tree type = TREE_TYPE (DR_REF (dr)); 1042 bool is_packed = not_size_aligned (DR_REF (dr)); 1043 if (dump_enabled_p ()) 1044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1045 "Unknown misalignment, is_packed = %d\n",is_packed); 1046 if ((TYPE_USER_ALIGN (type) && !is_packed) 1047 || targetm.vectorize.vector_alignment_reachable (type, is_packed)) 1048 return true; 1049 else 1050 return false; 1051 } 1052 1053 return true; 1054 } 1055 1056 1057 /* Calculate the cost of the memory access represented by DR. */ 1058 1059 static void 1060 vect_get_data_access_cost (struct data_reference *dr, 1061 unsigned int *inside_cost, 1062 unsigned int *outside_cost, 1063 stmt_vector_for_cost *body_cost_vec) 1064 { 1065 gimple stmt = DR_STMT (dr); 1066 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1067 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 1068 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1069 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1070 int ncopies = vf / nunits; 1071 1072 if (DR_IS_READ (dr)) 1073 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, 1074 NULL, body_cost_vec, false); 1075 else 1076 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); 1077 1078 if (dump_enabled_p ()) 1079 dump_printf_loc (MSG_NOTE, vect_location, 1080 "vect_get_data_access_cost: inside_cost = %d, " 1081 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1082 } 1083 1084 1085 /* Insert DR into peeling hash table with NPEEL as key. */ 1086 1087 static void 1088 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr, 1089 int npeel) 1090 { 1091 struct _vect_peel_info elem, *slot; 1092 _vect_peel_info **new_slot; 1093 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1094 1095 elem.npeel = npeel; 1096 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem); 1097 if (slot) 1098 slot->count++; 1099 else 1100 { 1101 slot = XNEW (struct _vect_peel_info); 1102 slot->npeel = npeel; 1103 slot->dr = dr; 1104 slot->count = 1; 1105 new_slot 1106 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT); 1107 *new_slot = slot; 1108 } 1109 1110 if (!supportable_dr_alignment 1111 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1112 slot->count += VECT_MAX_COST; 1113 } 1114 1115 1116 /* Traverse peeling hash table to find peeling option that aligns maximum 1117 number of data accesses. 
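   For instance, if the table holds the (illustrative) entries
   { npeel = 2 : 3 accesses, npeel = 5 : 3 accesses, npeel = 7 : 1 access },
   the traversal settles on npeel = 2: ties in the access count are broken
   in favour of the smaller peeling amount.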
*/ 1118 1119 int 1120 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1121 _vect_peel_extended_info *max) 1122 { 1123 vect_peel_info elem = *slot; 1124 1125 if (elem->count > max->peel_info.count 1126 || (elem->count == max->peel_info.count 1127 && max->peel_info.npeel > elem->npeel)) 1128 { 1129 max->peel_info.npeel = elem->npeel; 1130 max->peel_info.count = elem->count; 1131 max->peel_info.dr = elem->dr; 1132 } 1133 1134 return 1; 1135 } 1136 1137 1138 /* Traverse peeling hash table and calculate cost for each peeling option. 1139 Find the one with the lowest cost. */ 1140 1141 int 1142 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, 1143 _vect_peel_extended_info *min) 1144 { 1145 vect_peel_info elem = *slot; 1146 int save_misalignment, dummy; 1147 unsigned int inside_cost = 0, outside_cost = 0, i; 1148 gimple stmt = DR_STMT (elem->dr); 1149 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1151 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1152 struct data_reference *dr; 1153 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec; 1154 1155 prologue_cost_vec.create (2); 1156 body_cost_vec.create (2); 1157 epilogue_cost_vec.create (2); 1158 1159 FOR_EACH_VEC_ELT (datarefs, i, dr) 1160 { 1161 stmt = DR_STMT (dr); 1162 stmt_info = vinfo_for_stmt (stmt); 1163 /* For interleaving, only the alignment of the first access 1164 matters. */ 1165 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1166 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1167 continue; 1168 1169 save_misalignment = DR_MISALIGNMENT (dr); 1170 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel); 1171 vect_get_data_access_cost (dr, &inside_cost, &outside_cost, 1172 &body_cost_vec); 1173 SET_DR_MISALIGNMENT (dr, save_misalignment); 1174 } 1175 1176 auto_vec<stmt_info_for_cost> scalar_cost_vec; 1177 vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec); 1178 outside_cost += vect_get_known_peeling_cost 1179 (loop_vinfo, elem->npeel, &dummy, 1180 &scalar_cost_vec, &prologue_cost_vec, &epilogue_cost_vec); 1181 1182 /* Prologue and epilogue costs are added to the target model later. 1183 These costs depend only on the scalar iteration cost, the 1184 number of peeling iterations finally chosen, and the number of 1185 misaligned statements. So discard the information found here. */ 1186 prologue_cost_vec.release (); 1187 epilogue_cost_vec.release (); 1188 1189 if (inside_cost < min->inside_cost 1190 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost)) 1191 { 1192 min->inside_cost = inside_cost; 1193 min->outside_cost = outside_cost; 1194 min->body_cost_vec.release (); 1195 min->body_cost_vec = body_cost_vec; 1196 min->peel_info.dr = elem->dr; 1197 min->peel_info.npeel = elem->npeel; 1198 } 1199 else 1200 body_cost_vec.release (); 1201 1202 return 1; 1203 } 1204 1205 1206 /* Choose best peeling option by traversing peeling hash table and either 1207 choosing an option with the lowest cost (if cost model is enabled) or the 1208 option that aligns as many accesses as possible. 
*/ 1209 1210 static struct data_reference * 1211 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo, 1212 unsigned int *npeel, 1213 stmt_vector_for_cost *body_cost_vec) 1214 { 1215 struct _vect_peel_extended_info res; 1216 1217 res.peel_info.dr = NULL; 1218 res.body_cost_vec = stmt_vector_for_cost (); 1219 1220 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1221 { 1222 res.inside_cost = INT_MAX; 1223 res.outside_cost = INT_MAX; 1224 LOOP_VINFO_PEELING_HTAB (loop_vinfo) 1225 ->traverse <_vect_peel_extended_info *, 1226 vect_peeling_hash_get_lowest_cost> (&res); 1227 } 1228 else 1229 { 1230 res.peel_info.count = 0; 1231 LOOP_VINFO_PEELING_HTAB (loop_vinfo) 1232 ->traverse <_vect_peel_extended_info *, 1233 vect_peeling_hash_get_most_frequent> (&res); 1234 } 1235 1236 *npeel = res.peel_info.npeel; 1237 *body_cost_vec = res.body_cost_vec; 1238 return res.peel_info.dr; 1239 } 1240 1241 1242 /* Function vect_enhance_data_refs_alignment 1243 1244 This pass will use loop versioning and loop peeling in order to enhance 1245 the alignment of data references in the loop. 1246 1247 FOR NOW: we assume that whatever versioning/peeling takes place, only the 1248 original loop is to be vectorized. Any other loops that are created by 1249 the transformations performed in this pass - are not supposed to be 1250 vectorized. This restriction will be relaxed. 1251 1252 This pass will require a cost model to guide it whether to apply peeling 1253 or versioning or a combination of the two. For example, the scheme that 1254 intel uses when given a loop with several memory accesses, is as follows: 1255 choose one memory access ('p') which alignment you want to force by doing 1256 peeling. Then, either (1) generate a loop in which 'p' is aligned and all 1257 other accesses are not necessarily aligned, or (2) use loop versioning to 1258 generate one loop in which all accesses are aligned, and another loop in 1259 which only 'p' is necessarily aligned. 1260 1261 ("Automatic Intra-Register Vectorization for the Intel Architecture", 1262 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International 1263 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) 1264 1265 Devising a cost model is the most critical aspect of this work. It will 1266 guide us on which access to peel for, whether to use loop versioning, how 1267 many versions to create, etc. The cost model will probably consist of 1268 generic considerations as well as target specific considerations (on 1269 powerpc for example, misaligned stores are more painful than misaligned 1270 loads). 1271 1272 Here are the general steps involved in alignment enhancements: 1273 1274 -- original loop, before alignment analysis: 1275 for (i=0; i<N; i++){ 1276 x = q[i]; # DR_MISALIGNMENT(q) = unknown 1277 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1278 } 1279 1280 -- After vect_compute_data_refs_alignment: 1281 for (i=0; i<N; i++){ 1282 x = q[i]; # DR_MISALIGNMENT(q) = 3 1283 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1284 } 1285 1286 -- Possibility 1: we do loop versioning: 1287 if (p is aligned) { 1288 for (i=0; i<N; i++){ # loop 1A 1289 x = q[i]; # DR_MISALIGNMENT(q) = 3 1290 p[i] = y; # DR_MISALIGNMENT(p) = 0 1291 } 1292 } 1293 else { 1294 for (i=0; i<N; i++){ # loop 1B 1295 x = q[i]; # DR_MISALIGNMENT(q) = 3 1296 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1297 } 1298 } 1299 1300 -- Possibility 2: we do loop peeling: 1301 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 
1302 x = q[i]; 1303 p[i] = y; 1304 } 1305 for (i = 3; i < N; i++){ # loop 2A 1306 x = q[i]; # DR_MISALIGNMENT(q) = 0 1307 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1308 } 1309 1310 -- Possibility 3: combination of loop peeling and versioning: 1311 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1312 x = q[i]; 1313 p[i] = y; 1314 } 1315 if (p is aligned) { 1316 for (i = 3; i<N; i++){ # loop 3A 1317 x = q[i]; # DR_MISALIGNMENT(q) = 0 1318 p[i] = y; # DR_MISALIGNMENT(p) = 0 1319 } 1320 } 1321 else { 1322 for (i = 3; i<N; i++){ # loop 3B 1323 x = q[i]; # DR_MISALIGNMENT(q) = 0 1324 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1325 } 1326 } 1327 1328 These loops are later passed to loop_transform to be vectorized. The 1329 vectorizer will use the alignment information to guide the transformation 1330 (whether to generate regular loads/stores, or with special handling for 1331 misalignment). */ 1332 1333 bool 1334 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 1335 { 1336 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1337 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1338 enum dr_alignment_support supportable_dr_alignment; 1339 struct data_reference *dr0 = NULL, *first_store = NULL; 1340 struct data_reference *dr; 1341 unsigned int i, j; 1342 bool do_peeling = false; 1343 bool do_versioning = false; 1344 bool stat; 1345 gimple stmt; 1346 stmt_vec_info stmt_info; 1347 unsigned int npeel = 0; 1348 bool all_misalignments_unknown = true; 1349 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1350 unsigned possible_npeel_number = 1; 1351 tree vectype; 1352 unsigned int nelements, mis, same_align_drs_max = 0; 1353 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost (); 1354 1355 if (dump_enabled_p ()) 1356 dump_printf_loc (MSG_NOTE, vect_location, 1357 "=== vect_enhance_data_refs_alignment ===\n"); 1358 1359 /* While cost model enhancements are expected in the future, the high level 1360 view of the code at this time is as follows: 1361 1362 A) If there is a misaligned access then see if peeling to align 1363 this access can make all data references satisfy 1364 vect_supportable_dr_alignment. If so, update data structures 1365 as needed and return true. 1366 1367 B) If peeling wasn't possible and there is a data reference with an 1368 unknown misalignment that does not satisfy vect_supportable_dr_alignment 1369 then see if loop versioning checks can be used to make all data 1370 references satisfy vect_supportable_dr_alignment. If so, update 1371 data structures as needed and return true. 1372 1373 C) If neither peeling nor versioning were successful then return false if 1374 any data reference does not satisfy vect_supportable_dr_alignment. 1375 1376 D) Return true (all data references satisfy vect_supportable_dr_alignment). 1377 1378 Note, Possibility 3 above (which is peeling and versioning together) is not 1379 being done at this time. */ 1380 1381 /* (1) Peeling to force alignment. */ 1382 1383 /* (1.1) Decide whether to perform peeling, and how many iterations to peel: 1384 Considerations: 1385 + How many accesses will become aligned due to the peeling 1386 - How many accesses will become unaligned due to the peeling, 1387 and the cost of misaligned accesses. 1388 - The cost of peeling (the extra runtime checks, the increase 1389 in code size). 
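     As a concrete illustration of the bookkeeping done below: with V4SI
     vectors (nelements = 4) and a known misalignment of 2 elements, the
     immediate candidate is npeel = (4 - 2) & 3 = 2 peeled scalar
     iterations; under the cost model every further candidate obtained by
     adding a whole vector (2, 6, 10, ...) is also recorded in the peeling
     hash table, up to vf / nelements entries per data reference.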
  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              /* Save info about DR in the hash table.  */
              if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
                LOOP_VINFO_PEELING_HTAB (loop_vinfo)
                  = new hash_table<peel_info_hasher> (1);

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                TREE_TYPE (DR_REF (dr))));
              npeel_tmp = (negative
                           ? (mis - nelements) : (nelements - mis))
                  & (nelements - 1);

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will also be aligned by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add a full vector size, while we don't exceed the
                 vectorization factor.
                 We do this automatically for the cost model, since we
                 calculate the cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                possible_npeel_number = vf / nelements;

              /* Handle the aligned case.  We may decide to align some other
                 access, making DR unaligned.  */
              if (DR_MISALIGNMENT (dr) == 0)
                {
                  npeel_tmp = 0;
                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                    possible_npeel_number++;
                }

              for (j = 0; j < possible_npeel_number; j++)
                {
                  gcc_assert (npeel_tmp <= vf);
                  vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
                  npeel_tmp += nelements;
                }

              all_misalignments_unknown = false;
              /* Data-ref that was chosen for the case that all the
                 misalignments are unknown is not relevant anymore, since we
                 have a data-ref with known alignment.  */
              dr0 = NULL;
            }
          else
            {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target
                 prefers to align stores over loads.
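                 For example, if three loads share their (unknown)
                 alignment with Q while the only store P has no such
                 companions, DR0 is set to Q here, unless the load/store
                 cost comparison further below finds that leaving P
                 misaligned is the more expensive choice and switches DR0
                 to the store.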
*/ 1481 if (all_misalignments_unknown) 1482 { 1483 unsigned same_align_drs 1484 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1485 if (!dr0 1486 || same_align_drs_max < same_align_drs) 1487 { 1488 same_align_drs_max = same_align_drs; 1489 dr0 = dr; 1490 } 1491 /* For data-refs with the same number of related 1492 accesses prefer the one where the misalign 1493 computation will be invariant in the outermost loop. */ 1494 else if (same_align_drs_max == same_align_drs) 1495 { 1496 struct loop *ivloop0, *ivloop; 1497 ivloop0 = outermost_invariant_loop_for_expr 1498 (loop, DR_BASE_ADDRESS (dr0)); 1499 ivloop = outermost_invariant_loop_for_expr 1500 (loop, DR_BASE_ADDRESS (dr)); 1501 if ((ivloop && !ivloop0) 1502 || (ivloop && ivloop0 1503 && flow_loop_nested_p (ivloop, ivloop0))) 1504 dr0 = dr; 1505 } 1506 1507 if (!first_store && DR_IS_WRITE (dr)) 1508 first_store = dr; 1509 } 1510 1511 /* If there are both known and unknown misaligned accesses in the 1512 loop, we choose peeling amount according to the known 1513 accesses. */ 1514 if (!supportable_dr_alignment) 1515 { 1516 dr0 = dr; 1517 if (!first_store && DR_IS_WRITE (dr)) 1518 first_store = dr; 1519 } 1520 } 1521 } 1522 else 1523 { 1524 if (!aligned_access_p (dr)) 1525 { 1526 if (dump_enabled_p ()) 1527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1528 "vector alignment may not be reachable\n"); 1529 break; 1530 } 1531 } 1532 } 1533 1534 /* Check if we can possibly peel the loop. */ 1535 if (!vect_can_advance_ivs_p (loop_vinfo) 1536 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))) 1537 do_peeling = false; 1538 1539 /* If we don't know how many times the peeling loop will run 1540 assume it will run VF-1 times and disable peeling if the remaining 1541 iters are less than the vectorization factor. */ 1542 if (do_peeling 1543 && all_misalignments_unknown 1544 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 1545 && (LOOP_VINFO_INT_NITERS (loop_vinfo) 1546 < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1)) 1547 do_peeling = false; 1548 1549 if (do_peeling 1550 && all_misalignments_unknown 1551 && vect_supportable_dr_alignment (dr0, false)) 1552 { 1553 /* Check if the target requires to prefer stores over loads, i.e., if 1554 misaligned stores are more expensive than misaligned loads (taking 1555 drs with same alignment into account). */ 1556 if (first_store && DR_IS_READ (dr0)) 1557 { 1558 unsigned int load_inside_cost = 0, load_outside_cost = 0; 1559 unsigned int store_inside_cost = 0, store_outside_cost = 0; 1560 unsigned int load_inside_penalty = 0, load_outside_penalty = 0; 1561 unsigned int store_inside_penalty = 0, store_outside_penalty = 0; 1562 stmt_vector_for_cost dummy; 1563 dummy.create (2); 1564 1565 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost, 1566 &dummy); 1567 vect_get_data_access_cost (first_store, &store_inside_cost, 1568 &store_outside_cost, &dummy); 1569 1570 dummy.release (); 1571 1572 /* Calculate the penalty for leaving FIRST_STORE unaligned (by 1573 aligning the load DR0). 
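             For example, if FIRST_STORE shares its alignment with one
             further load and one further store, the penalty accumulated
             below is store_inside_cost (for FIRST_STORE itself) plus
             load_inside_cost plus store_inside_cost, i.e. the cost of
             executing all of those accesses misaligned once DR0 is the
             one chosen for alignment.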
*/ 1574 load_inside_penalty = store_inside_cost; 1575 load_outside_penalty = store_outside_cost; 1576 for (i = 0; 1577 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( 1578 DR_STMT (first_store))).iterate (i, &dr); 1579 i++) 1580 if (DR_IS_READ (dr)) 1581 { 1582 load_inside_penalty += load_inside_cost; 1583 load_outside_penalty += load_outside_cost; 1584 } 1585 else 1586 { 1587 load_inside_penalty += store_inside_cost; 1588 load_outside_penalty += store_outside_cost; 1589 } 1590 1591 /* Calculate the penalty for leaving DR0 unaligned (by 1592 aligning the FIRST_STORE). */ 1593 store_inside_penalty = load_inside_cost; 1594 store_outside_penalty = load_outside_cost; 1595 for (i = 0; 1596 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( 1597 DR_STMT (dr0))).iterate (i, &dr); 1598 i++) 1599 if (DR_IS_READ (dr)) 1600 { 1601 store_inside_penalty += load_inside_cost; 1602 store_outside_penalty += load_outside_cost; 1603 } 1604 else 1605 { 1606 store_inside_penalty += store_inside_cost; 1607 store_outside_penalty += store_outside_cost; 1608 } 1609 1610 if (load_inside_penalty > store_inside_penalty 1611 || (load_inside_penalty == store_inside_penalty 1612 && load_outside_penalty > store_outside_penalty)) 1613 dr0 = first_store; 1614 } 1615 1616 /* In case there are only loads with different unknown misalignments, use 1617 peeling only if it may help to align other accesses in the loop. */ 1618 if (!first_store 1619 && !STMT_VINFO_SAME_ALIGN_REFS ( 1620 vinfo_for_stmt (DR_STMT (dr0))).length () 1621 && vect_supportable_dr_alignment (dr0, false) 1622 != dr_unaligned_supported) 1623 do_peeling = false; 1624 } 1625 1626 if (do_peeling && !dr0) 1627 { 1628 /* Peeling is possible, but there is no data access that is not supported 1629 unless aligned. So we try to choose the best possible peeling. */ 1630 1631 /* We should get here only if there are drs with known misalignment. */ 1632 gcc_assert (!all_misalignments_unknown); 1633 1634 /* Choose the best peeling from the hash table. */ 1635 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel, 1636 &body_cost_vec); 1637 if (!dr0 || !npeel) 1638 do_peeling = false; 1639 1640 /* If peeling by npeel will result in a remaining loop not iterating 1641 enough to be vectorized then do not peel. */ 1642 if (do_peeling 1643 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 1644 && (LOOP_VINFO_INT_NITERS (loop_vinfo) 1645 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel)) 1646 do_peeling = false; 1647 } 1648 1649 if (do_peeling) 1650 { 1651 stmt = DR_STMT (dr0); 1652 stmt_info = vinfo_for_stmt (stmt); 1653 vectype = STMT_VINFO_VECTYPE (stmt_info); 1654 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1655 1656 if (known_alignment_for_access_p (dr0)) 1657 { 1658 bool negative = tree_int_cst_compare (DR_STEP (dr0), 1659 size_zero_node) < 0; 1660 if (!npeel) 1661 { 1662 /* Since it's known at compile time, compute the number of 1663 iterations in the peeled loop (the peeling factor) for use in 1664 updating DR_MISALIGNMENT values. The peeling factor is the 1665 vectorization factor minus the misalignment as an element 1666 count. */ 1667 mis = DR_MISALIGNMENT (dr0); 1668 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); 1669 npeel = ((negative ? mis - nelements : nelements - mis) 1670 & (nelements - 1)); 1671 } 1672 1673 /* For interleaved data access every iteration accesses all the 1674 members of the group, therefore we divide the number of iterations 1675 by the group size. 
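             For example, a group of two interleaved 4-byte accesses with
             nelements = 8 and a misalignment of 2 elements gives
             npeel = 8 - 2 = 6 element positions, i.e. 6 / 2 = 3 peeled
             scalar iterations, because each scalar iteration touches both
             members of the group.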
*/ 1676 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 1677 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1678 npeel /= GROUP_SIZE (stmt_info); 1679 1680 if (dump_enabled_p ()) 1681 dump_printf_loc (MSG_NOTE, vect_location, 1682 "Try peeling by %d\n", npeel); 1683 } 1684 1685 /* Ensure that all data refs can be vectorized after the peel. */ 1686 FOR_EACH_VEC_ELT (datarefs, i, dr) 1687 { 1688 int save_misalignment; 1689 1690 if (dr == dr0) 1691 continue; 1692 1693 stmt = DR_STMT (dr); 1694 stmt_info = vinfo_for_stmt (stmt); 1695 /* For interleaving, only the alignment of the first access 1696 matters. */ 1697 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1698 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1699 continue; 1700 1701 /* Strided loads perform only component accesses, alignment is 1702 irrelevant for them. */ 1703 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 1704 continue; 1705 1706 save_misalignment = DR_MISALIGNMENT (dr); 1707 vect_update_misalignment_for_peel (dr, dr0, npeel); 1708 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1709 SET_DR_MISALIGNMENT (dr, save_misalignment); 1710 1711 if (!supportable_dr_alignment) 1712 { 1713 do_peeling = false; 1714 break; 1715 } 1716 } 1717 1718 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 1719 { 1720 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 1721 if (!stat) 1722 do_peeling = false; 1723 else 1724 { 1725 body_cost_vec.release (); 1726 return stat; 1727 } 1728 } 1729 1730 if (do_peeling) 1731 { 1732 unsigned max_allowed_peel 1733 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 1734 if (max_allowed_peel != (unsigned)-1) 1735 { 1736 unsigned max_peel = npeel; 1737 if (max_peel == 0) 1738 { 1739 gimple dr_stmt = DR_STMT (dr0); 1740 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt); 1741 tree vtype = STMT_VINFO_VECTYPE (vinfo); 1742 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1; 1743 } 1744 if (max_peel > max_allowed_peel) 1745 { 1746 do_peeling = false; 1747 if (dump_enabled_p ()) 1748 dump_printf_loc (MSG_NOTE, vect_location, 1749 "Disable peeling, max peels reached: %d\n", max_peel); 1750 } 1751 } 1752 } 1753 1754 if (do_peeling) 1755 { 1756 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 1757 If the misalignment of DR_i is identical to that of dr0 then set 1758 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 1759 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 1760 by the peeling factor times the element size of DR_i (MOD the 1761 vectorization factor times the size). Otherwise, the 1762 misalignment of DR_i must be set to unknown. */ 1763 FOR_EACH_VEC_ELT (datarefs, i, dr) 1764 if (dr != dr0) 1765 vect_update_misalignment_for_peel (dr, dr0, npeel); 1766 1767 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 1768 if (npeel) 1769 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 1770 else 1771 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 1772 = DR_MISALIGNMENT (dr0); 1773 SET_DR_MISALIGNMENT (dr0, 0); 1774 if (dump_enabled_p ()) 1775 { 1776 dump_printf_loc (MSG_NOTE, vect_location, 1777 "Alignment of access forced using peeling.\n"); 1778 dump_printf_loc (MSG_NOTE, vect_location, 1779 "Peeling for alignment will be applied.\n"); 1780 } 1781 /* The inside-loop cost will be accounted for in vectorizable_load 1782 and vectorizable_store correctly with adjusted alignments. 1783 Drop the body_cst_vec on the floor here. 
*/ 1784 body_cost_vec.release (); 1785 1786 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 1787 gcc_assert (stat); 1788 return stat; 1789 } 1790 } 1791 1792 body_cost_vec.release (); 1793 1794 /* (2) Versioning to force alignment. */ 1795 1796 /* Try versioning if: 1797 1) optimize loop for speed 1798 2) there is at least one unsupported misaligned data ref with an unknown 1799 misalignment, and 1800 3) all misaligned data refs with a known misalignment are supported, and 1801 4) the number of runtime alignment checks is within reason. */ 1802 1803 do_versioning = 1804 optimize_loop_nest_for_speed_p (loop) 1805 && (!loop->inner); /* FORNOW */ 1806 1807 if (do_versioning) 1808 { 1809 FOR_EACH_VEC_ELT (datarefs, i, dr) 1810 { 1811 stmt = DR_STMT (dr); 1812 stmt_info = vinfo_for_stmt (stmt); 1813 1814 /* For interleaving, only the alignment of the first access 1815 matters. */ 1816 if (aligned_access_p (dr) 1817 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1818 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 1819 continue; 1820 1821 /* Strided loads perform only component accesses, alignment is 1822 irrelevant for them. */ 1823 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 1824 continue; 1825 1826 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1827 1828 if (!supportable_dr_alignment) 1829 { 1830 gimple stmt; 1831 int mask; 1832 tree vectype; 1833 1834 if (known_alignment_for_access_p (dr) 1835 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 1836 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 1837 { 1838 do_versioning = false; 1839 break; 1840 } 1841 1842 stmt = DR_STMT (dr); 1843 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 1844 gcc_assert (vectype); 1845 1846 /* The rightmost bits of an aligned address must be zeros. 1847 Construct the mask needed for this test. For example, 1848 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 1849 mask must be 15 = 0xf. */ 1850 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1; 1851 1852 /* FORNOW: use the same mask to test all potentially unaligned 1853 references in the loop. The vectorizer currently supports 1854 a single vector size, see the reference to 1855 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 1856 vectorization factor is computed. */ 1857 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 1858 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 1859 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 1860 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 1861 DR_STMT (dr)); 1862 } 1863 } 1864 1865 /* Versioning requires at least one misaligned data reference. */ 1866 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 1867 do_versioning = false; 1868 else if (!do_versioning) 1869 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 1870 } 1871 1872 if (do_versioning) 1873 { 1874 vec<gimple> may_misalign_stmts 1875 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 1876 gimple stmt; 1877 1878 /* It can now be assumed that the data references in the statements 1879 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 1880 of the loop being vectorized. 
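The runtime test built from LOOP_VINFO_PTR_MASK selects the vectorized version only when the low bits of all the recorded addresses are zero, so resetting their misalignment to zero here is safe.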
*/ 1881 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 1882 { 1883 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1884 dr = STMT_VINFO_DATA_REF (stmt_info); 1885 SET_DR_MISALIGNMENT (dr, 0); 1886 if (dump_enabled_p ()) 1887 dump_printf_loc (MSG_NOTE, vect_location, 1888 "Alignment of access forced using versioning.\n"); 1889 } 1890 1891 if (dump_enabled_p ()) 1892 dump_printf_loc (MSG_NOTE, vect_location, 1893 "Versioning for alignment will be applied.\n"); 1894 1895 /* Peeling and versioning can't be done together at this time. */ 1896 gcc_assert (! (do_peeling && do_versioning)); 1897 1898 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 1899 gcc_assert (stat); 1900 return stat; 1901 } 1902 1903 /* This point is reached if neither peeling nor versioning is being done. */ 1904 gcc_assert (! (do_peeling || do_versioning)); 1905 1906 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 1907 return stat; 1908 } 1909 1910 1911 /* Function vect_find_same_alignment_drs. 1912 1913 Update group and alignment relations according to the chosen 1914 vectorization factor. */ 1915 1916 static void 1917 vect_find_same_alignment_drs (struct data_dependence_relation *ddr, 1918 loop_vec_info loop_vinfo) 1919 { 1920 unsigned int i; 1921 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1922 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1923 struct data_reference *dra = DDR_A (ddr); 1924 struct data_reference *drb = DDR_B (ddr); 1925 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 1926 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 1927 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra)))); 1928 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb)))); 1929 lambda_vector dist_v; 1930 unsigned int loop_depth; 1931 1932 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 1933 return; 1934 1935 if (dra == drb) 1936 return; 1937 1938 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 1939 return; 1940 1941 /* Loop-based vectorization and known data dependence. */ 1942 if (DDR_NUM_DIST_VECTS (ddr) == 0) 1943 return; 1944 1945 /* Data-dependence analysis reports a distance vector of zero 1946 for data-references that overlap only in the first iteration 1947 but have different sign step (see PR45764). 1948 So as a sanity check require equal DR_STEP. */ 1949 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 1950 return; 1951 1952 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 1953 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 1954 { 1955 int dist = dist_v[loop_depth]; 1956 1957 if (dump_enabled_p ()) 1958 dump_printf_loc (MSG_NOTE, vect_location, 1959 "dependence distance = %d.\n", dist); 1960 1961 /* Same loop iteration. */ 1962 if (dist == 0 1963 || (dist % vectorization_factor == 0 && dra_size == drb_size)) 1964 { 1965 /* Two references with distance zero have the same alignment. 
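For example, the load and the store of a[i] in a statement like a[i] = a[i] + 1 have distance zero, while a distance that is a multiple of the vectorization factor (for equal-sized types) keeps the two accesses a whole number of vectors apart, so their alignments agree as well.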
*/ 1966 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 1967 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 1968 if (dump_enabled_p ()) 1969 { 1970 dump_printf_loc (MSG_NOTE, vect_location, 1971 "accesses have the same alignment.\n"); 1972 dump_printf (MSG_NOTE, 1973 "dependence distance modulo vf == 0 between "); 1974 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 1975 dump_printf (MSG_NOTE, " and "); 1976 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 1977 dump_printf (MSG_NOTE, "\n"); 1978 } 1979 } 1980 } 1981 } 1982 1983 1984 /* Function vect_analyze_data_refs_alignment 1985 1986 Analyze the alignment of the data-references in the loop. 1987 Return FALSE if a data reference is found that cannot be vectorized. */ 1988 1989 bool 1990 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo, 1991 bb_vec_info bb_vinfo) 1992 { 1993 if (dump_enabled_p ()) 1994 dump_printf_loc (MSG_NOTE, vect_location, 1995 "=== vect_analyze_data_refs_alignment ===\n"); 1996 1997 /* Mark groups of data references with same alignment using 1998 data dependence information. */ 1999 if (loop_vinfo) 2000 { 2001 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo); 2002 struct data_dependence_relation *ddr; 2003 unsigned int i; 2004 2005 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2006 vect_find_same_alignment_drs (ddr, loop_vinfo); 2007 } 2008 2009 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo)) 2010 { 2011 if (dump_enabled_p ()) 2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2013 "not vectorized: can't calculate alignment " 2014 "for data ref.\n"); 2015 return false; 2016 } 2017 2018 return true; 2019 } 2020 2021 2022 /* Analyze groups of accesses: check that DR belongs to a group of 2023 accesses of legal size, step, etc. Detect gaps, single element 2024 interleaving, and other special cases. Set grouped access info. 2025 Collect groups of strided stores for further use in SLP analysis. */ 2026 2027 static bool 2028 vect_analyze_group_access (struct data_reference *dr) 2029 { 2030 tree step = DR_STEP (dr); 2031 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2032 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2033 gimple stmt = DR_STMT (dr); 2034 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2035 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2036 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2037 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2038 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2039 bool slp_impossible = false; 2040 struct loop *loop = NULL; 2041 2042 if (loop_vinfo) 2043 loop = LOOP_VINFO_LOOP (loop_vinfo); 2044 2045 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2046 size of the interleaving group (including gaps). */ 2047 groupsize = absu_hwi (dr_step) / type_size; 2048 2049 /* Not consecutive access is possible only if it is a part of interleaving. */ 2050 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 2051 { 2052 /* Check if it this DR is a part of interleaving, and is a single 2053 element of the group that is accessed in the loop. */ 2054 2055 /* Gaps are supported only for loads. STEP must be a multiple of the type 2056 size. The size of the group must be a power of 2. 
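A typical (purely illustrative) case is a load that touches only one member of each group, e.g.

         for (i = 0; i < n; i++)
           sum += a[4*i];

       where the step is four elements: the access is recorded below as a group of size four that contains just this statement.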
*/ 2057 if (DR_IS_READ (dr) 2058 && (dr_step % type_size) == 0 2059 && groupsize > 0 2060 && exact_log2 (groupsize) != -1) 2061 { 2062 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; 2063 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2064 if (dump_enabled_p ()) 2065 { 2066 dump_printf_loc (MSG_NOTE, vect_location, 2067 "Detected single element interleaving "); 2068 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2069 dump_printf (MSG_NOTE, " step "); 2070 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2071 dump_printf (MSG_NOTE, "\n"); 2072 } 2073 2074 if (loop_vinfo) 2075 { 2076 if (dump_enabled_p ()) 2077 dump_printf_loc (MSG_NOTE, vect_location, 2078 "Data access with gaps requires scalar " 2079 "epilogue loop\n"); 2080 if (loop->inner) 2081 { 2082 if (dump_enabled_p ()) 2083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2084 "Peeling for outer loop is not" 2085 " supported\n"); 2086 return false; 2087 } 2088 2089 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; 2090 } 2091 2092 return true; 2093 } 2094 2095 if (dump_enabled_p ()) 2096 { 2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2098 "not consecutive access "); 2099 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 2100 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 2101 } 2102 2103 if (bb_vinfo) 2104 { 2105 /* Mark the statement as unvectorizable. */ 2106 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2107 return true; 2108 } 2109 2110 return false; 2111 } 2112 2113 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) 2114 { 2115 /* First stmt in the interleaving chain. Check the chain. */ 2116 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2117 struct data_reference *data_ref = dr; 2118 unsigned int count = 1; 2119 tree prev_init = DR_INIT (data_ref); 2120 gimple prev = stmt; 2121 HOST_WIDE_INT diff, gaps = 0; 2122 unsigned HOST_WIDE_INT count_in_bytes; 2123 2124 while (next) 2125 { 2126 /* Skip same data-refs. In case that two or more stmts share 2127 data-ref (supported only for loads), we vectorize only the first 2128 stmt, and the rest get their vectorized loads from the first 2129 one. */ 2130 if (!tree_int_cst_compare (DR_INIT (data_ref), 2131 DR_INIT (STMT_VINFO_DATA_REF ( 2132 vinfo_for_stmt (next))))) 2133 { 2134 if (DR_IS_WRITE (data_ref)) 2135 { 2136 if (dump_enabled_p ()) 2137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2138 "Two store stmts share the same dr.\n"); 2139 return false; 2140 } 2141 2142 /* For load use the same data-ref load. */ 2143 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2144 2145 prev = next; 2146 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2147 continue; 2148 } 2149 2150 prev = next; 2151 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2152 2153 /* All group members have the same STEP by construction. */ 2154 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2155 2156 /* Check that the distance between two accesses is equal to the type 2157 size. Otherwise, we have gaps. */ 2158 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2159 - TREE_INT_CST_LOW (prev_init)) / type_size; 2160 if (diff != 1) 2161 { 2162 /* FORNOW: SLP of accesses with gaps is not supported. 
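The group can still be used by loop vectorization, but it is not pushed to the list of SLP candidates further down.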
*/ 2163 slp_impossible = true; 2164 if (DR_IS_WRITE (data_ref)) 2165 { 2166 if (dump_enabled_p ()) 2167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2168 "interleaved store with gaps\n"); 2169 return false; 2170 } 2171 2172 gaps += diff - 1; 2173 } 2174 2175 last_accessed_element += diff; 2176 2177 /* Store the gap from the previous member of the group. If there is no 2178 gap in the access, GROUP_GAP is always 1. */ 2179 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2180 2181 prev_init = DR_INIT (data_ref); 2182 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2183 /* Count the number of data-refs in the chain. */ 2184 count++; 2185 } 2186 2187 /* COUNT is the number of accesses found, we multiply it by the size of 2188 the type to get COUNT_IN_BYTES. */ 2189 count_in_bytes = type_size * count; 2190 2191 /* Check that the size of the interleaving (including gaps) is not 2192 greater than STEP. */ 2193 if (dr_step != 0 2194 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size) 2195 { 2196 if (dump_enabled_p ()) 2197 { 2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2199 "interleaving size is greater than step for "); 2200 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 2201 DR_REF (dr)); 2202 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 2203 } 2204 return false; 2205 } 2206 2207 /* Check that the size of the interleaving is equal to STEP for stores, 2208 i.e., that there are no gaps. */ 2209 if (dr_step != 0 2210 && absu_hwi (dr_step) != count_in_bytes) 2211 { 2212 if (DR_IS_READ (dr)) 2213 { 2214 slp_impossible = true; 2215 /* There is a gap after the last load in the group. This gap is a 2216 difference between the groupsize and the number of elements. 2217 When there is no gap, this difference should be 0. */ 2218 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count; 2219 } 2220 else 2221 { 2222 if (dump_enabled_p ()) 2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2224 "interleaved store with gaps\n"); 2225 return false; 2226 } 2227 } 2228 2229 /* Check that STEP is a multiple of type size. */ 2230 if (dr_step != 0 2231 && (dr_step % type_size) != 0) 2232 { 2233 if (dump_enabled_p ()) 2234 { 2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2236 "step is not a multiple of type size: step "); 2237 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step); 2238 dump_printf (MSG_MISSED_OPTIMIZATION, " size "); 2239 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 2240 TYPE_SIZE_UNIT (scalar_type)); 2241 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 2242 } 2243 return false; 2244 } 2245 2246 if (groupsize == 0) 2247 groupsize = count; 2248 2249 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2250 if (dump_enabled_p ()) 2251 dump_printf_loc (MSG_NOTE, vect_location, 2252 "Detected interleaving of size %d\n", (int)groupsize); 2253 2254 /* SLP: create an SLP data structure for every interleaving group of 2255 stores for further analysis in vect_analyse_slp. */ 2256 if (DR_IS_WRITE (dr) && !slp_impossible) 2257 { 2258 if (loop_vinfo) 2259 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2260 if (bb_vinfo) 2261 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2262 } 2263 2264 /* There is a gap in the end of the group. 
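For example, loads of a[4*i] and a[4*i+1] with a step of four elements leave the last two members of every group unused, so the vector load of the final group may read past the last scalar access and a scalar epilogue is required.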
*/ 2265 if (groupsize - last_accessed_element > 0 && loop_vinfo) 2266 { 2267 if (dump_enabled_p ()) 2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2269 "Data access with gaps requires scalar " 2270 "epilogue loop\n"); 2271 if (loop->inner) 2272 { 2273 if (dump_enabled_p ()) 2274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2275 "Peeling for outer loop is not supported\n"); 2276 return false; 2277 } 2278 2279 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; 2280 } 2281 } 2282 2283 return true; 2284 } 2285 2286 2287 /* Analyze the access pattern of the data-reference DR. 2288 In case of non-consecutive accesses call vect_analyze_group_access() to 2289 analyze groups of accesses. */ 2290 2291 static bool 2292 vect_analyze_data_ref_access (struct data_reference *dr) 2293 { 2294 tree step = DR_STEP (dr); 2295 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2296 gimple stmt = DR_STMT (dr); 2297 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2298 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2299 struct loop *loop = NULL; 2300 2301 if (loop_vinfo) 2302 loop = LOOP_VINFO_LOOP (loop_vinfo); 2303 2304 if (loop_vinfo && !step) 2305 { 2306 if (dump_enabled_p ()) 2307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2308 "bad data-ref access in loop\n"); 2309 return false; 2310 } 2311 2312 /* Allow invariant loads in not nested loops. */ 2313 if (loop_vinfo && integer_zerop (step)) 2314 { 2315 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2316 if (nested_in_vect_loop_p (loop, stmt)) 2317 { 2318 if (dump_enabled_p ()) 2319 dump_printf_loc (MSG_NOTE, vect_location, 2320 "zero step in inner loop of nest\n"); 2321 return false; 2322 } 2323 return DR_IS_READ (dr); 2324 } 2325 2326 if (loop && nested_in_vect_loop_p (loop, stmt)) 2327 { 2328 /* Interleaved accesses are not yet supported within outer-loop 2329 vectorization for references in the inner-loop. */ 2330 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2331 2332 /* For the rest of the analysis we use the outer-loop step. */ 2333 step = STMT_VINFO_DR_STEP (stmt_info); 2334 if (integer_zerop (step)) 2335 { 2336 if (dump_enabled_p ()) 2337 dump_printf_loc (MSG_NOTE, vect_location, 2338 "zero step in outer loop.\n"); 2339 if (DR_IS_READ (dr)) 2340 return true; 2341 else 2342 return false; 2343 } 2344 } 2345 2346 /* Consecutive? */ 2347 if (TREE_CODE (step) == INTEGER_CST) 2348 { 2349 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2350 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2351 || (dr_step < 0 2352 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2353 { 2354 /* Mark that it is not interleaving. */ 2355 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2356 return true; 2357 } 2358 } 2359 2360 if (loop && nested_in_vect_loop_p (loop, stmt)) 2361 { 2362 if (dump_enabled_p ()) 2363 dump_printf_loc (MSG_NOTE, vect_location, 2364 "grouped access in outer loop.\n"); 2365 return false; 2366 } 2367 2368 /* Assume this is a DR handled by non-constant strided load case. */ 2369 if (TREE_CODE (step) != INTEGER_CST) 2370 return STMT_VINFO_STRIDE_LOAD_P (stmt_info); 2371 2372 /* Not consecutive access - check if it's a part of interleaving group. */ 2373 return vect_analyze_group_access (dr); 2374 } 2375 2376 2377 2378 /* A helper function used in the comparator function to sort data 2379 references. T1 and T2 are two data references to be compared. 2380 The function returns -1, 0, or 1. 
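The result only needs to be a consistent total order for sorting purposes; constants, for instance, are ordered by their hash values rather than by numeric value.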
*/ 2381 2382 static int 2383 compare_tree (tree t1, tree t2) 2384 { 2385 int i, cmp; 2386 enum tree_code code; 2387 char tclass; 2388 2389 if (t1 == t2) 2390 return 0; 2391 if (t1 == NULL) 2392 return -1; 2393 if (t2 == NULL) 2394 return 1; 2395 2396 STRIP_NOPS (t1); 2397 STRIP_NOPS (t2); 2398 2399 if (TREE_CODE (t1) != TREE_CODE (t2)) 2400 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1; 2401 2402 code = TREE_CODE (t1); 2403 switch (code) 2404 { 2405 /* For const values, we can just use hash values for comparisons. */ 2406 case INTEGER_CST: 2407 case REAL_CST: 2408 case FIXED_CST: 2409 case STRING_CST: 2410 case COMPLEX_CST: 2411 case VECTOR_CST: 2412 { 2413 hashval_t h1 = iterative_hash_expr (t1, 0); 2414 hashval_t h2 = iterative_hash_expr (t2, 0); 2415 if (h1 != h2) 2416 return h1 < h2 ? -1 : 1; 2417 break; 2418 } 2419 2420 case SSA_NAME: 2421 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2)); 2422 if (cmp != 0) 2423 return cmp; 2424 2425 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2)) 2426 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1; 2427 break; 2428 2429 default: 2430 tclass = TREE_CODE_CLASS (code); 2431 2432 /* For var-decl, we could compare their UIDs. */ 2433 if (tclass == tcc_declaration) 2434 { 2435 if (DECL_UID (t1) != DECL_UID (t2)) 2436 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1; 2437 break; 2438 } 2439 2440 /* For expressions with operands, compare their operands recursively. */ 2441 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i) 2442 { 2443 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i)); 2444 if (cmp != 0) 2445 return cmp; 2446 } 2447 } 2448 2449 return 0; 2450 } 2451 2452 2453 /* Compare two data-references DRA and DRB to group them into chunks 2454 suitable for grouping. */ 2455 2456 static int 2457 dr_group_sort_cmp (const void *dra_, const void *drb_) 2458 { 2459 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2460 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2461 int cmp; 2462 2463 /* Stabilize sort. */ 2464 if (dra == drb) 2465 return 0; 2466 2467 /* Ordering of DRs according to base. */ 2468 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)) 2469 { 2470 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)); 2471 if (cmp != 0) 2472 return cmp; 2473 } 2474 2475 /* And according to DR_OFFSET. */ 2476 if (!dr_equal_offsets_p (dra, drb)) 2477 { 2478 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2479 if (cmp != 0) 2480 return cmp; 2481 } 2482 2483 /* Put reads before writes. */ 2484 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2485 return DR_IS_READ (dra) ? -1 : 1; 2486 2487 /* Then sort after access size. */ 2488 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2489 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0)) 2490 { 2491 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2492 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2493 if (cmp != 0) 2494 return cmp; 2495 } 2496 2497 /* And after step. */ 2498 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2499 { 2500 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb)); 2501 if (cmp != 0) 2502 return cmp; 2503 } 2504 2505 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2506 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)); 2507 if (cmp == 0) 2508 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; 2509 return cmp; 2510 } 2511 2512 /* Function vect_analyze_data_ref_accesses. 
2513 2514 Analyze the access pattern of all the data references in the loop. 2515 2516 FORNOW: the only access pattern that is considered vectorizable is a 2517 simple step 1 (consecutive) access. 2518 2519 FORNOW: handle only arrays and pointer accesses. */ 2520 2521 bool 2522 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) 2523 { 2524 unsigned int i; 2525 vec<data_reference_p> datarefs; 2526 struct data_reference *dr; 2527 2528 if (dump_enabled_p ()) 2529 dump_printf_loc (MSG_NOTE, vect_location, 2530 "=== vect_analyze_data_ref_accesses ===\n"); 2531 2532 if (loop_vinfo) 2533 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 2534 else 2535 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 2536 2537 if (datarefs.is_empty ()) 2538 return true; 2539 2540 /* Sort the array of datarefs to make building the interleaving chains 2541 linear. Don't modify the original vector's order, it is needed for 2542 determining what dependencies are reversed. */ 2543 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2544 datarefs_copy.qsort (dr_group_sort_cmp); 2545 2546 /* Build the interleaving chains. */ 2547 for (i = 0; i < datarefs_copy.length () - 1;) 2548 { 2549 data_reference_p dra = datarefs_copy[i]; 2550 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2551 stmt_vec_info lastinfo = NULL; 2552 for (i = i + 1; i < datarefs_copy.length (); ++i) 2553 { 2554 data_reference_p drb = datarefs_copy[i]; 2555 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2556 2557 /* ??? Imperfect sorting (non-compatible types, non-modulo 2558 accesses, same accesses) can lead to a group to be artificially 2559 split here as we don't just skip over those. If it really 2560 matters we can push those to a worklist and re-iterate 2561 over them. The we can just skip ahead to the next DR here. */ 2562 2563 /* Check that the data-refs have same first location (except init) 2564 and they are both either store or load (not load and store, 2565 not masked loads or stores). */ 2566 if (DR_IS_READ (dra) != DR_IS_READ (drb) 2567 || !operand_equal_p (DR_BASE_ADDRESS (dra), 2568 DR_BASE_ADDRESS (drb), 0) 2569 || !dr_equal_offsets_p (dra, drb) 2570 || !gimple_assign_single_p (DR_STMT (dra)) 2571 || !gimple_assign_single_p (DR_STMT (drb))) 2572 break; 2573 2574 /* Check that the data-refs have the same constant size and step. */ 2575 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 2576 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 2577 if (!tree_fits_uhwi_p (sza) 2578 || !tree_fits_uhwi_p (szb) 2579 || !tree_int_cst_equal (sza, szb) 2580 || !tree_fits_shwi_p (DR_STEP (dra)) 2581 || !tree_fits_shwi_p (DR_STEP (drb)) 2582 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb))) 2583 break; 2584 2585 /* Do not place the same access in the interleaving chain twice. */ 2586 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0) 2587 break; 2588 2589 /* Check the types are compatible. 2590 ??? We don't distinguish this during sorting. */ 2591 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 2592 TREE_TYPE (DR_REF (drb)))) 2593 break; 2594 2595 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ 2596 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 2597 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 2598 gcc_assert (init_a <= init_b); 2599 2600 /* If init_b == init_a + the size of the type * k, we have an 2601 interleaving, and DRA is accessed before DRB. 
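For example (names are only illustrative), the two stores in

         for (i = 0; i < n; i++)
           {
             a[2*i] = x;
             a[2*i+1] = y;
           }

       have DR_INIT 0 and 4 for a four-byte element type and a common step of eight bytes, so the code below chains them into a single interleaving group.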
*/ 2602 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 2603 if ((init_b - init_a) % type_size_a != 0) 2604 break; 2605 2606 /* The step (if not zero) is greater than the difference between 2607 data-refs' inits. This splits groups into suitable sizes. */ 2608 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 2609 if (step != 0 && step <= (init_b - init_a)) 2610 break; 2611 2612 if (dump_enabled_p ()) 2613 { 2614 dump_printf_loc (MSG_NOTE, vect_location, 2615 "Detected interleaving "); 2616 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2617 dump_printf (MSG_NOTE, " and "); 2618 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2619 dump_printf (MSG_NOTE, "\n"); 2620 } 2621 2622 /* Link the found element into the group list. */ 2623 if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) 2624 { 2625 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); 2626 lastinfo = stmtinfo_a; 2627 } 2628 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); 2629 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); 2630 lastinfo = stmtinfo_b; 2631 } 2632 } 2633 2634 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 2635 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 2636 && !vect_analyze_data_ref_access (dr)) 2637 { 2638 if (dump_enabled_p ()) 2639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2640 "not vectorized: complicated access pattern.\n"); 2641 2642 if (bb_vinfo) 2643 { 2644 /* Mark the statement as not vectorizable. */ 2645 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2646 continue; 2647 } 2648 else 2649 { 2650 datarefs_copy.release (); 2651 return false; 2652 } 2653 } 2654 2655 datarefs_copy.release (); 2656 return true; 2657 } 2658 2659 2660 /* Operator == between two dr_with_seg_len objects. 2661 2662 This equality operator is used to make sure two data refs 2663 are the same one so that we will consider to combine the 2664 aliasing checks of those two pairs of data dependent data 2665 refs. */ 2666 2667 static bool 2668 operator == (const dr_with_seg_len& d1, 2669 const dr_with_seg_len& d2) 2670 { 2671 return operand_equal_p (DR_BASE_ADDRESS (d1.dr), 2672 DR_BASE_ADDRESS (d2.dr), 0) 2673 && compare_tree (d1.offset, d2.offset) == 0 2674 && compare_tree (d1.seg_len, d2.seg_len) == 0; 2675 } 2676 2677 /* Function comp_dr_with_seg_len_pair. 2678 2679 Comparison function for sorting objects of dr_with_seg_len_pair_t 2680 so that we can combine aliasing checks in one scan. */ 2681 2682 static int 2683 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_) 2684 { 2685 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_; 2686 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_; 2687 2688 const dr_with_seg_len &p11 = p1->first, 2689 &p12 = p1->second, 2690 &p21 = p2->first, 2691 &p22 = p2->second; 2692 2693 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks 2694 if a and c have the same basic address snd step, and b and d have the same 2695 address and step. Therefore, if any a&c or b&d don't have the same address 2696 and step, we don't care the order of those two pairs after sorting. 
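The comparison below therefore only orders pairs by the base addresses, steps and offsets of their two members and treats everything else as equal.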
*/ 2697 int comp_res; 2698 2699 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr), 2700 DR_BASE_ADDRESS (p21.dr))) != 0) 2701 return comp_res; 2702 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr), 2703 DR_BASE_ADDRESS (p22.dr))) != 0) 2704 return comp_res; 2705 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0) 2706 return comp_res; 2707 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0) 2708 return comp_res; 2709 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0) 2710 return comp_res; 2711 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0) 2712 return comp_res; 2713 2714 return 0; 2715 } 2716 2717 /* Function vect_vfa_segment_size. 2718 2719 Create an expression that computes the size of segment 2720 that will be accessed for a data reference. The functions takes into 2721 account that realignment loads may access one more vector. 2722 2723 Input: 2724 DR: The data reference. 2725 LENGTH_FACTOR: segment length to consider. 2726 2727 Return an expression whose value is the size of segment which will be 2728 accessed by DR. */ 2729 2730 static tree 2731 vect_vfa_segment_size (struct data_reference *dr, tree length_factor) 2732 { 2733 tree segment_length; 2734 2735 if (integer_zerop (DR_STEP (dr))) 2736 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 2737 else 2738 segment_length = size_binop (MULT_EXPR, 2739 fold_convert (sizetype, DR_STEP (dr)), 2740 fold_convert (sizetype, length_factor)); 2741 2742 if (vect_supportable_dr_alignment (dr, false) 2743 == dr_explicit_realign_optimized) 2744 { 2745 tree vector_size = TYPE_SIZE_UNIT 2746 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); 2747 2748 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size); 2749 } 2750 return segment_length; 2751 } 2752 2753 /* Function vect_prune_runtime_alias_test_list. 2754 2755 Prune a list of ddrs to be tested at run-time by versioning for alias. 2756 Merge several alias checks into one if possible. 2757 Return FALSE if resulting list of ddrs is longer then allowed by 2758 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 2759 2760 bool 2761 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 2762 { 2763 vec<ddr_p> may_alias_ddrs = 2764 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 2765 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs = 2766 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 2767 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2768 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 2769 2770 ddr_p ddr; 2771 unsigned int i; 2772 tree length_factor; 2773 2774 if (dump_enabled_p ()) 2775 dump_printf_loc (MSG_NOTE, vect_location, 2776 "=== vect_prune_runtime_alias_test_list ===\n"); 2777 2778 if (may_alias_ddrs.is_empty ()) 2779 return true; 2780 2781 /* Basically, for each pair of dependent data refs store_ptr_0 2782 and load_ptr_0, we create an expression: 2783 2784 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 2785 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0)) 2786 2787 for aliasing checks. However, in some cases we can decrease 2788 the number of checks by combining two checks into one. 
For 2789 example, suppose we have another pair of data refs store_ptr_0 2790 and load_ptr_1, and if the following condition is satisfied: 2791 2792 load_ptr_0 < load_ptr_1 && 2793 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0 2794 2795 (this condition means, in each iteration of the vectorized loop, 2796 the accessed memory of store_ptr_0 cannot be between the memory 2797 of load_ptr_0 and load_ptr_1.) 2798 2799 we can then use only the following expression to finish the 2800 aliasing checks between store_ptr_0 & load_ptr_0 and 2801 store_ptr_0 & load_ptr_1: 2802 2803 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 2804 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0)) 2805 2806 Note that we only consider that load_ptr_0 and load_ptr_1 have the 2807 same base address. */ 2808 2809 comp_alias_ddrs.create (may_alias_ddrs.length ()); 2810 2811 /* First, we collect all data ref pairs for aliasing checks. */ 2812 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 2813 { 2814 struct data_reference *dr_a, *dr_b; 2815 gimple dr_group_first_a, dr_group_first_b; 2816 tree segment_length_a, segment_length_b; 2817 gimple stmt_a, stmt_b; 2818 2819 dr_a = DDR_A (ddr); 2820 stmt_a = DR_STMT (DDR_A (ddr)); 2821 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); 2822 if (dr_group_first_a) 2823 { 2824 stmt_a = dr_group_first_a; 2825 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); 2826 } 2827 2828 dr_b = DDR_B (ddr); 2829 stmt_b = DR_STMT (DDR_B (ddr)); 2830 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); 2831 if (dr_group_first_b) 2832 { 2833 stmt_b = dr_group_first_b; 2834 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); 2835 } 2836 2837 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) 2838 length_factor = scalar_loop_iters; 2839 else 2840 length_factor = size_int (vect_factor); 2841 segment_length_a = vect_vfa_segment_size (dr_a, length_factor); 2842 segment_length_b = vect_vfa_segment_size (dr_b, length_factor); 2843 2844 dr_with_seg_len_pair_t dr_with_seg_len_pair 2845 (dr_with_seg_len (dr_a, segment_length_a), 2846 dr_with_seg_len (dr_b, segment_length_b)); 2847 2848 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0) 2849 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 2850 2851 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 2852 } 2853 2854 /* Second, we sort the collected data ref pairs so that we can scan 2855 them once to combine all possible aliasing checks. */ 2856 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair); 2857 2858 /* Third, we scan the sorted dr pairs and check if we can combine 2859 alias checks of two neighbouring dr pairs. */ 2860 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i) 2861 { 2862 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */ 2863 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first, 2864 *dr_b1 = &comp_alias_ddrs[i-1].second, 2865 *dr_a2 = &comp_alias_ddrs[i].first, 2866 *dr_b2 = &comp_alias_ddrs[i].second; 2867 2868 /* Remove duplicate data ref pairs.
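Identical pairs can occur because several dependences between members of the same two interleaving groups are all rewritten above to the group leaders with the same segment lengths.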
*/ 2869 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2) 2870 { 2871 if (dump_enabled_p ()) 2872 { 2873 dump_printf_loc (MSG_NOTE, vect_location, 2874 "found equal ranges "); 2875 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2876 DR_REF (dr_a1->dr)); 2877 dump_printf (MSG_NOTE, ", "); 2878 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2879 DR_REF (dr_b1->dr)); 2880 dump_printf (MSG_NOTE, " and "); 2881 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2882 DR_REF (dr_a2->dr)); 2883 dump_printf (MSG_NOTE, ", "); 2884 dump_generic_expr (MSG_NOTE, TDF_SLIM, 2885 DR_REF (dr_b2->dr)); 2886 dump_printf (MSG_NOTE, "\n"); 2887 } 2888 2889 comp_alias_ddrs.ordered_remove (i--); 2890 continue; 2891 } 2892 2893 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2) 2894 { 2895 /* We consider the case that DR_B1 and DR_B2 are same memrefs, 2896 and DR_A1 and DR_A2 are two consecutive memrefs. */ 2897 if (*dr_a1 == *dr_a2) 2898 { 2899 std::swap (dr_a1, dr_b1); 2900 std::swap (dr_a2, dr_b2); 2901 } 2902 2903 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr), 2904 DR_BASE_ADDRESS (dr_a2->dr), 2905 0) 2906 || !tree_fits_shwi_p (dr_a1->offset) 2907 || !tree_fits_shwi_p (dr_a2->offset)) 2908 continue; 2909 2910 /* Make sure dr_a1 starts left of dr_a2. */ 2911 if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset)) 2912 std::swap (*dr_a1, *dr_a2); 2913 2914 unsigned HOST_WIDE_INT diff 2915 = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset); 2916 2917 2918 bool do_remove = false; 2919 2920 /* If the left segment does not extend beyond the start of the 2921 right segment the new segment length is that of the right 2922 plus the segment distance. */ 2923 if (tree_fits_uhwi_p (dr_a1->seg_len) 2924 && compare_tree_int (dr_a1->seg_len, diff) <= 0) 2925 { 2926 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len, 2927 size_int (diff)); 2928 do_remove = true; 2929 } 2930 /* Generally the new segment length is the maximum of the 2931 left segment size and the right segment size plus the distance. 2932 ??? We can also build tree MAX_EXPR here but it's not clear this 2933 is profitable. */ 2934 else if (tree_fits_uhwi_p (dr_a1->seg_len) 2935 && tree_fits_uhwi_p (dr_a2->seg_len)) 2936 { 2937 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len); 2938 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len); 2939 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2)); 2940 do_remove = true; 2941 } 2942 /* Now we check if the following condition is satisfied: 2943 2944 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B 2945 2946 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However, 2947 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we 2948 have to make a best estimation. We can get the minimum value 2949 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B, 2950 then either of the following two conditions can guarantee the 2951 one above: 2952 2953 1: DIFF <= MIN_SEG_LEN_B 2954 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */ 2955 else 2956 { 2957 unsigned HOST_WIDE_INT min_seg_len_b 2958 = (tree_fits_uhwi_p (dr_b1->seg_len) 2959 ? 
tree_to_uhwi (dr_b1->seg_len) 2960 : vect_factor); 2961 2962 if (diff <= min_seg_len_b 2963 || (tree_fits_uhwi_p (dr_a1->seg_len) 2964 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b)) 2965 { 2966 dr_a1->seg_len = size_binop (PLUS_EXPR, 2967 dr_a2->seg_len, size_int (diff)); 2968 do_remove = true; 2969 } 2970 } 2971 2972 if (do_remove) 2973 { 2974 if (dump_enabled_p ()) 2975 { 2976 dump_printf_loc (MSG_NOTE, vect_location, 2977 "merging ranges for "); 2978 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr)); 2979 dump_printf (MSG_NOTE, ", "); 2980 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr)); 2981 dump_printf (MSG_NOTE, " and "); 2982 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr)); 2983 dump_printf (MSG_NOTE, ", "); 2984 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr)); 2985 dump_printf (MSG_NOTE, "\n"); 2986 } 2987 comp_alias_ddrs.ordered_remove (i--); 2988 } 2989 } 2990 } 2991 2992 dump_printf_loc (MSG_NOTE, vect_location, 2993 "improved number of alias checks from %d to %d\n", 2994 may_alias_ddrs.length (), comp_alias_ddrs.length ()); 2995 if ((int) comp_alias_ddrs.length () > 2996 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 2997 return false; 2998 2999 return true; 3000 } 3001 3002 /* Check whether a non-affine read in stmt is suitable for gather load 3003 and if so, return a builtin decl for that operation. */ 3004 3005 tree 3006 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep, 3007 tree *offp, int *scalep) 3008 { 3009 HOST_WIDE_INT scale = 1, pbitpos, pbitsize; 3010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3012 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3013 tree offtype = NULL_TREE; 3014 tree decl, base, off; 3015 machine_mode pmode; 3016 int punsignedp, pvolatilep; 3017 3018 base = DR_REF (dr); 3019 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3020 see if we can use the def stmt of the address. */ 3021 if (is_gimple_call (stmt) 3022 && gimple_call_internal_p (stmt) 3023 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD 3024 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE) 3025 && TREE_CODE (base) == MEM_REF 3026 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3027 && integer_zerop (TREE_OPERAND (base, 1)) 3028 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3029 { 3030 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3031 if (is_gimple_assign (def_stmt) 3032 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3033 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3034 } 3035 3036 /* The gather builtins need address of the form 3037 loop_invariant + vector * {1, 2, 4, 8} 3038 or 3039 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3040 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3041 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3042 multiplications and additions in it. To get a vector, we need 3043 a single SSA_NAME that will be defined in the loop and will 3044 contain everything that is not loop invariant and that can be 3045 vectorized. The following code attempts to find such a preexistng 3046 SSA_NAME OFF and put the loop invariants into a tree BASE 3047 that can be gimplified before the loop. 
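For example, for an access like *(p + ((sizetype) i) * 4) with loop-invariant p, the walk below strips the multiplication into SCALE = 4, leaves the SSA_NAME of i in OFF and accumulates p together with any constant byte offset into BASE.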
*/ 3048 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, 3049 &pmode, &punsignedp, &pvolatilep, false); 3050 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0); 3051 3052 if (TREE_CODE (base) == MEM_REF) 3053 { 3054 if (!integer_zerop (TREE_OPERAND (base, 1))) 3055 { 3056 if (off == NULL_TREE) 3057 { 3058 offset_int moff = mem_ref_offset (base); 3059 off = wide_int_to_tree (sizetype, moff); 3060 } 3061 else 3062 off = size_binop (PLUS_EXPR, off, 3063 fold_convert (sizetype, TREE_OPERAND (base, 1))); 3064 } 3065 base = TREE_OPERAND (base, 0); 3066 } 3067 else 3068 base = build_fold_addr_expr (base); 3069 3070 if (off == NULL_TREE) 3071 off = size_zero_node; 3072 3073 /* If base is not loop invariant, either off is 0, then we start with just 3074 the constant offset in the loop invariant BASE and continue with base 3075 as OFF, otherwise give up. 3076 We could handle that case by gimplifying the addition of base + off 3077 into some SSA_NAME and use that as off, but for now punt. */ 3078 if (!expr_invariant_in_loop_p (loop, base)) 3079 { 3080 if (!integer_zerop (off)) 3081 return NULL_TREE; 3082 off = base; 3083 base = size_int (pbitpos / BITS_PER_UNIT); 3084 } 3085 /* Otherwise put base + constant offset into the loop invariant BASE 3086 and continue with OFF. */ 3087 else 3088 { 3089 base = fold_convert (sizetype, base); 3090 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT)); 3091 } 3092 3093 /* OFF at this point may be either a SSA_NAME or some tree expression 3094 from get_inner_reference. Try to peel off loop invariants from it 3095 into BASE as long as possible. */ 3096 STRIP_NOPS (off); 3097 while (offtype == NULL_TREE) 3098 { 3099 enum tree_code code; 3100 tree op0, op1, add = NULL_TREE; 3101 3102 if (TREE_CODE (off) == SSA_NAME) 3103 { 3104 gimple def_stmt = SSA_NAME_DEF_STMT (off); 3105 3106 if (expr_invariant_in_loop_p (loop, off)) 3107 return NULL_TREE; 3108 3109 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3110 break; 3111 3112 op0 = gimple_assign_rhs1 (def_stmt); 3113 code = gimple_assign_rhs_code (def_stmt); 3114 op1 = gimple_assign_rhs2 (def_stmt); 3115 } 3116 else 3117 { 3118 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3119 return NULL_TREE; 3120 code = TREE_CODE (off); 3121 extract_ops_from_tree (off, &code, &op0, &op1); 3122 } 3123 switch (code) 3124 { 3125 case POINTER_PLUS_EXPR: 3126 case PLUS_EXPR: 3127 if (expr_invariant_in_loop_p (loop, op0)) 3128 { 3129 add = op0; 3130 off = op1; 3131 do_add: 3132 add = fold_convert (sizetype, add); 3133 if (scale != 1) 3134 add = size_binop (MULT_EXPR, add, size_int (scale)); 3135 base = size_binop (PLUS_EXPR, base, add); 3136 continue; 3137 } 3138 if (expr_invariant_in_loop_p (loop, op1)) 3139 { 3140 add = op1; 3141 off = op0; 3142 goto do_add; 3143 } 3144 break; 3145 case MINUS_EXPR: 3146 if (expr_invariant_in_loop_p (loop, op1)) 3147 { 3148 add = fold_convert (sizetype, op1); 3149 add = size_binop (MINUS_EXPR, size_zero_node, add); 3150 off = op0; 3151 goto do_add; 3152 } 3153 break; 3154 case MULT_EXPR: 3155 if (scale == 1 && tree_fits_shwi_p (op1)) 3156 { 3157 scale = tree_to_shwi (op1); 3158 off = op0; 3159 continue; 3160 } 3161 break; 3162 case SSA_NAME: 3163 off = op0; 3164 continue; 3165 CASE_CONVERT: 3166 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3167 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3168 break; 3169 if (TYPE_PRECISION (TREE_TYPE (op0)) 3170 == TYPE_PRECISION (TREE_TYPE (off))) 3171 { 3172 off = op0; 3173 continue; 3174 } 3175 if (TYPE_PRECISION 
(TREE_TYPE (op0)) 3176 < TYPE_PRECISION (TREE_TYPE (off))) 3177 { 3178 off = op0; 3179 offtype = TREE_TYPE (off); 3180 STRIP_NOPS (off); 3181 continue; 3182 } 3183 break; 3184 default: 3185 break; 3186 } 3187 break; 3188 } 3189 3190 /* If at the end OFF still isn't a SSA_NAME or isn't 3191 defined in the loop, punt. */ 3192 if (TREE_CODE (off) != SSA_NAME 3193 || expr_invariant_in_loop_p (loop, off)) 3194 return NULL_TREE; 3195 3196 if (offtype == NULL_TREE) 3197 offtype = TREE_TYPE (off); 3198 3199 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), 3200 offtype, scale); 3201 if (decl == NULL_TREE) 3202 return NULL_TREE; 3203 3204 if (basep) 3205 *basep = base; 3206 if (offp) 3207 *offp = off; 3208 if (scalep) 3209 *scalep = scale; 3210 return decl; 3211 } 3212 3213 /* Function vect_analyze_data_refs. 3214 3215 Find all the data references in the loop or basic block. 3216 3217 The general structure of the analysis of data refs in the vectorizer is as 3218 follows: 3219 1- vect_analyze_data_refs(loop/bb): call 3220 compute_data_dependences_for_loop/bb to find and analyze all data-refs 3221 in the loop/bb and their dependences. 3222 2- vect_analyze_dependences(): apply dependence testing using ddrs. 3223 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 3224 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 3225 3226 */ 3227 3228 bool 3229 vect_analyze_data_refs (loop_vec_info loop_vinfo, 3230 bb_vec_info bb_vinfo, 3231 int *min_vf, unsigned *n_stmts) 3232 { 3233 struct loop *loop = NULL; 3234 basic_block bb = NULL; 3235 unsigned int i; 3236 vec<data_reference_p> datarefs; 3237 struct data_reference *dr; 3238 tree scalar_type; 3239 3240 if (dump_enabled_p ()) 3241 dump_printf_loc (MSG_NOTE, vect_location, 3242 "=== vect_analyze_data_refs ===\n"); 3243 3244 if (loop_vinfo) 3245 { 3246 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 3247 3248 loop = LOOP_VINFO_LOOP (loop_vinfo); 3249 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 3250 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))) 3251 { 3252 if (dump_enabled_p ()) 3253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3254 "not vectorized: loop contains function calls" 3255 " or data references that cannot be analyzed\n"); 3256 return false; 3257 } 3258 3259 for (i = 0; i < loop->num_nodes; i++) 3260 { 3261 gimple_stmt_iterator gsi; 3262 3263 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi)) 3264 { 3265 gimple stmt = gsi_stmt (gsi); 3266 if (is_gimple_debug (stmt)) 3267 continue; 3268 ++*n_stmts; 3269 if (!find_data_references_in_stmt (loop, stmt, &datarefs)) 3270 { 3271 if (is_gimple_call (stmt) && loop->safelen) 3272 { 3273 tree fndecl = gimple_call_fndecl (stmt), op; 3274 if (fndecl != NULL_TREE) 3275 { 3276 struct cgraph_node *node = cgraph_node::get (fndecl); 3277 if (node != NULL && node->simd_clones != NULL) 3278 { 3279 unsigned int j, n = gimple_call_num_args (stmt); 3280 for (j = 0; j < n; j++) 3281 { 3282 op = gimple_call_arg (stmt, j); 3283 if (DECL_P (op) 3284 || (REFERENCE_CLASS_P (op) 3285 && get_base_address (op))) 3286 break; 3287 } 3288 op = gimple_call_lhs (stmt); 3289 /* Ignore #pragma omp declare simd functions 3290 if they don't have data references in the 3291 call stmt itself. 
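Such calls are left for the SIMD clone handling later on instead of being rejected here as unanalyzable data references.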
*/ 3292 if (j == n 3293 && !(op 3294 && (DECL_P (op) 3295 || (REFERENCE_CLASS_P (op) 3296 && get_base_address (op))))) 3297 continue; 3298 } 3299 } 3300 } 3301 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs; 3302 if (dump_enabled_p ()) 3303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3304 "not vectorized: loop contains function " 3305 "calls or data references that cannot " 3306 "be analyzed\n"); 3307 return false; 3308 } 3309 } 3310 } 3311 3312 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs; 3313 } 3314 else 3315 { 3316 gimple_stmt_iterator gsi; 3317 3318 bb = BB_VINFO_BB (bb_vinfo); 3319 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 3320 { 3321 gimple stmt = gsi_stmt (gsi); 3322 if (is_gimple_debug (stmt)) 3323 continue; 3324 ++*n_stmts; 3325 if (!find_data_references_in_stmt (NULL, stmt, 3326 &BB_VINFO_DATAREFS (bb_vinfo))) 3327 { 3328 /* Mark the rest of the basic-block as unvectorizable. */ 3329 for (; !gsi_end_p (gsi); gsi_next (&gsi)) 3330 { 3331 stmt = gsi_stmt (gsi); 3332 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false; 3333 } 3334 break; 3335 } 3336 } 3337 3338 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 3339 } 3340 3341 /* Go through the data-refs, check that the analysis succeeded. Update 3342 pointer from stmt_vec_info struct to DR and vectype. */ 3343 3344 FOR_EACH_VEC_ELT (datarefs, i, dr) 3345 { 3346 gimple stmt; 3347 stmt_vec_info stmt_info; 3348 tree base, offset, init; 3349 bool gather = false; 3350 bool simd_lane_access = false; 3351 int vf; 3352 3353 again: 3354 if (!dr || !DR_REF (dr)) 3355 { 3356 if (dump_enabled_p ()) 3357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3358 "not vectorized: unhandled data-ref\n"); 3359 return false; 3360 } 3361 3362 stmt = DR_STMT (dr); 3363 stmt_info = vinfo_for_stmt (stmt); 3364 3365 /* Discard clobbers from the dataref vector. We will remove 3366 clobber stmts during vectorization. */ 3367 if (gimple_clobber_p (stmt)) 3368 { 3369 free_data_ref (dr); 3370 if (i == datarefs.length () - 1) 3371 { 3372 datarefs.pop (); 3373 break; 3374 } 3375 datarefs.ordered_remove (i); 3376 dr = datarefs[i]; 3377 goto again; 3378 } 3379 3380 /* Check that analysis of the data-ref succeeded. */ 3381 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 3382 || !DR_STEP (dr)) 3383 { 3384 bool maybe_gather 3385 = DR_IS_READ (dr) 3386 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3387 && targetm.vectorize.builtin_gather != NULL; 3388 bool maybe_simd_lane_access 3389 = loop_vinfo && loop->simduid; 3390 3391 /* If target supports vector gather loads, or if this might be 3392 a SIMD lane access, see if they can't be used. 
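Both cases start by re-analyzing the reference, and only proceed when the new analysis succeeds with a zero step, i.e. when everything that varies in the loop has been pushed into the offset, which is the form gather loads and GOMP_SIMD_LANE based accesses require.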
*/ 3393 if (loop_vinfo 3394 && (maybe_gather || maybe_simd_lane_access) 3395 && !nested_in_vect_loop_p (loop, stmt)) 3396 { 3397 struct data_reference *newdr 3398 = create_data_ref (NULL, loop_containing_stmt (stmt), 3399 DR_REF (dr), stmt, true); 3400 gcc_assert (newdr != NULL && DR_REF (newdr)); 3401 if (DR_BASE_ADDRESS (newdr) 3402 && DR_OFFSET (newdr) 3403 && DR_INIT (newdr) 3404 && DR_STEP (newdr) 3405 && integer_zerop (DR_STEP (newdr))) 3406 { 3407 if (maybe_simd_lane_access) 3408 { 3409 tree off = DR_OFFSET (newdr); 3410 STRIP_NOPS (off); 3411 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 3412 && TREE_CODE (off) == MULT_EXPR 3413 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 3414 { 3415 tree step = TREE_OPERAND (off, 1); 3416 off = TREE_OPERAND (off, 0); 3417 STRIP_NOPS (off); 3418 if (CONVERT_EXPR_P (off) 3419 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 3420 0))) 3421 < TYPE_PRECISION (TREE_TYPE (off))) 3422 off = TREE_OPERAND (off, 0); 3423 if (TREE_CODE (off) == SSA_NAME) 3424 { 3425 gimple def = SSA_NAME_DEF_STMT (off); 3426 tree reft = TREE_TYPE (DR_REF (newdr)); 3427 if (is_gimple_call (def) 3428 && gimple_call_internal_p (def) 3429 && (gimple_call_internal_fn (def) 3430 == IFN_GOMP_SIMD_LANE)) 3431 { 3432 tree arg = gimple_call_arg (def, 0); 3433 gcc_assert (TREE_CODE (arg) == SSA_NAME); 3434 arg = SSA_NAME_VAR (arg); 3435 if (arg == loop->simduid 3436 /* For now. */ 3437 && tree_int_cst_equal 3438 (TYPE_SIZE_UNIT (reft), 3439 step)) 3440 { 3441 DR_OFFSET (newdr) = ssize_int (0); 3442 DR_STEP (newdr) = step; 3443 DR_ALIGNED_TO (newdr) 3444 = size_int (BIGGEST_ALIGNMENT); 3445 dr = newdr; 3446 simd_lane_access = true; 3447 } 3448 } 3449 } 3450 } 3451 } 3452 if (!simd_lane_access && maybe_gather) 3453 { 3454 dr = newdr; 3455 gather = true; 3456 } 3457 } 3458 if (!gather && !simd_lane_access) 3459 free_data_ref (newdr); 3460 } 3461 3462 if (!gather && !simd_lane_access) 3463 { 3464 if (dump_enabled_p ()) 3465 { 3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3467 "not vectorized: data ref analysis " 3468 "failed "); 3469 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3470 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3471 } 3472 3473 if (bb_vinfo) 3474 break; 3475 3476 return false; 3477 } 3478 } 3479 3480 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 3481 { 3482 if (dump_enabled_p ()) 3483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3484 "not vectorized: base addr of dr is a " 3485 "constant\n"); 3486 3487 if (bb_vinfo) 3488 break; 3489 3490 if (gather || simd_lane_access) 3491 free_data_ref (dr); 3492 return false; 3493 } 3494 3495 if (TREE_THIS_VOLATILE (DR_REF (dr))) 3496 { 3497 if (dump_enabled_p ()) 3498 { 3499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3500 "not vectorized: volatile type "); 3501 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3502 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3503 } 3504 3505 if (bb_vinfo) 3506 break; 3507 3508 return false; 3509 } 3510 3511 if (stmt_can_throw_internal (stmt)) 3512 { 3513 if (dump_enabled_p ()) 3514 { 3515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3516 "not vectorized: statement can throw an " 3517 "exception "); 3518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3519 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3520 } 3521 3522 if (bb_vinfo) 3523 break; 3524 3525 if (gather || simd_lane_access) 3526 free_data_ref (dr); 3527 return false; 3528 } 3529 3530 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 3531 && 
DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 3532 { 3533 if (dump_enabled_p ()) 3534 { 3535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3536 "not vectorized: statement is bitfield " 3537 "access "); 3538 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3539 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3540 } 3541 3542 if (bb_vinfo) 3543 break; 3544 3545 if (gather || simd_lane_access) 3546 free_data_ref (dr); 3547 return false; 3548 } 3549 3550 base = unshare_expr (DR_BASE_ADDRESS (dr)); 3551 offset = unshare_expr (DR_OFFSET (dr)); 3552 init = unshare_expr (DR_INIT (dr)); 3553 3554 if (is_gimple_call (stmt) 3555 && (!gimple_call_internal_p (stmt) 3556 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 3557 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 3558 { 3559 if (dump_enabled_p ()) 3560 { 3561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3562 "not vectorized: dr in a call "); 3563 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3564 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3565 } 3566 3567 if (bb_vinfo) 3568 break; 3569 3570 if (gather || simd_lane_access) 3571 free_data_ref (dr); 3572 return false; 3573 } 3574 3575 /* Update DR field in stmt_vec_info struct. */ 3576 3577 /* If the dataref is in an inner-loop of the loop that is considered 3578 for vectorization, we also want to analyze the access relative to 3579 the outer-loop (DR contains information only relative to the 3580 inner-most enclosing loop). We do that by building a reference to the 3581 first location accessed by the inner-loop, and analyzing it relative to 3582 the outer-loop. */ 3583 if (loop && nested_in_vect_loop_p (loop, stmt)) 3584 { 3585 tree outer_step, outer_base, outer_init; 3586 HOST_WIDE_INT pbitsize, pbitpos; 3587 tree poffset; 3588 machine_mode pmode; 3589 int punsignedp, pvolatilep; 3590 affine_iv base_iv, offset_iv; 3591 tree dinit; 3592 3593 /* Build a reference to the first location accessed by the 3594 inner-loop: *(BASE+INIT). (The first location is actually 3595 BASE+INIT+OFFSET, but we add OFFSET separately later).
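For example, for an inner-loop access a[i][j] in a nest iterating over i, the reference built here is a[i][0], and its evolution in the outer loop supplies the outer base, offset and step stored in the STMT_VINFO_DR_* fields.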
*/ 3596 tree inner_base = build_fold_indirect_ref 3597 (fold_build_pointer_plus (base, init)); 3598 3599 if (dump_enabled_p ()) 3600 { 3601 dump_printf_loc (MSG_NOTE, vect_location, 3602 "analyze in outer-loop: "); 3603 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base); 3604 dump_printf (MSG_NOTE, "\n"); 3605 } 3606 3607 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, 3608 &poffset, &pmode, &punsignedp, &pvolatilep, false); 3609 gcc_assert (outer_base != NULL_TREE); 3610 3611 if (pbitpos % BITS_PER_UNIT != 0) 3612 { 3613 if (dump_enabled_p ()) 3614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3615 "failed: bit offset alignment.\n"); 3616 return false; 3617 } 3618 3619 outer_base = build_fold_addr_expr (outer_base); 3620 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base, 3621 &base_iv, false)) 3622 { 3623 if (dump_enabled_p ()) 3624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3625 "failed: evolution of base is not affine.\n"); 3626 return false; 3627 } 3628 3629 if (offset) 3630 { 3631 if (poffset) 3632 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, 3633 poffset); 3634 else 3635 poffset = offset; 3636 } 3637 3638 if (!poffset) 3639 { 3640 offset_iv.base = ssize_int (0); 3641 offset_iv.step = ssize_int (0); 3642 } 3643 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset, 3644 &offset_iv, false)) 3645 { 3646 if (dump_enabled_p ()) 3647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3648 "evolution of offset is not affine.\n"); 3649 return false; 3650 } 3651 3652 outer_init = ssize_int (pbitpos / BITS_PER_UNIT); 3653 split_constant_offset (base_iv.base, &base_iv.base, &dinit); 3654 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3655 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); 3656 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3657 3658 outer_step = size_binop (PLUS_EXPR, 3659 fold_convert (ssizetype, base_iv.step), 3660 fold_convert (ssizetype, offset_iv.step)); 3661 3662 STMT_VINFO_DR_STEP (stmt_info) = outer_step; 3663 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ 3664 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; 3665 STMT_VINFO_DR_INIT (stmt_info) = outer_init; 3666 STMT_VINFO_DR_OFFSET (stmt_info) = 3667 fold_convert (ssizetype, offset_iv.base); 3668 STMT_VINFO_DR_ALIGNED_TO (stmt_info) = 3669 size_int (highest_pow2_factor (offset_iv.base)); 3670 3671 if (dump_enabled_p ()) 3672 { 3673 dump_printf_loc (MSG_NOTE, vect_location, 3674 "\touter base_address: "); 3675 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3676 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 3677 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 3678 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3679 STMT_VINFO_DR_OFFSET (stmt_info)); 3680 dump_printf (MSG_NOTE, 3681 "\n\touter constant offset from base address: "); 3682 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3683 STMT_VINFO_DR_INIT (stmt_info)); 3684 dump_printf (MSG_NOTE, "\n\touter step: "); 3685 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3686 STMT_VINFO_DR_STEP (stmt_info)); 3687 dump_printf (MSG_NOTE, "\n\touter aligned to: "); 3688 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3689 STMT_VINFO_DR_ALIGNED_TO (stmt_info)); 3690 dump_printf (MSG_NOTE, "\n"); 3691 } 3692 } 3693 3694 if (STMT_VINFO_DATA_REF (stmt_info)) 3695 { 3696 if (dump_enabled_p ()) 3697 { 3698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3699 "not vectorized: more than one data ref " 3700 "in stmt: "); 3701 dump_gimple_stmt 
(MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3702 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3703 } 3704 3705 if (bb_vinfo) 3706 break; 3707 3708 if (gather || simd_lane_access) 3709 free_data_ref (dr); 3710 return false; 3711 } 3712 3713 STMT_VINFO_DATA_REF (stmt_info) = dr; 3714 if (simd_lane_access) 3715 { 3716 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 3717 free_data_ref (datarefs[i]); 3718 datarefs[i] = dr; 3719 } 3720 3721 /* Set vectype for STMT. */ 3722 scalar_type = TREE_TYPE (DR_REF (dr)); 3723 STMT_VINFO_VECTYPE (stmt_info) 3724 = get_vectype_for_scalar_type (scalar_type); 3725 if (!STMT_VINFO_VECTYPE (stmt_info)) 3726 { 3727 if (dump_enabled_p ()) 3728 { 3729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3730 "not vectorized: no vectype for stmt: "); 3731 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3732 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 3733 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 3734 scalar_type); 3735 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3736 } 3737 3738 if (bb_vinfo) 3739 break; 3740 3741 if (gather || simd_lane_access) 3742 { 3743 STMT_VINFO_DATA_REF (stmt_info) = NULL; 3744 if (gather) 3745 free_data_ref (dr); 3746 } 3747 return false; 3748 } 3749 else 3750 { 3751 if (dump_enabled_p ()) 3752 { 3753 dump_printf_loc (MSG_NOTE, vect_location, 3754 "got vectype for stmt: "); 3755 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 3756 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3757 STMT_VINFO_VECTYPE (stmt_info)); 3758 dump_printf (MSG_NOTE, "\n"); 3759 } 3760 } 3761 3762 /* Adjust the minimal vectorization factor according to the 3763 vector type. */ 3764 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 3765 if (vf > *min_vf) 3766 *min_vf = vf; 3767 3768 if (gather) 3769 { 3770 tree off; 3771 3772 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL); 3773 if (gather 3774 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) 3775 gather = false; 3776 if (!gather) 3777 { 3778 STMT_VINFO_DATA_REF (stmt_info) = NULL; 3779 free_data_ref (dr); 3780 if (dump_enabled_p ()) 3781 { 3782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3783 "not vectorized: not suitable for gather " 3784 "load "); 3785 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3786 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3787 } 3788 return false; 3789 } 3790 3791 datarefs[i] = dr; 3792 STMT_VINFO_GATHER_P (stmt_info) = true; 3793 } 3794 else if (loop_vinfo 3795 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 3796 { 3797 if (nested_in_vect_loop_p (loop, stmt) 3798 || !DR_IS_READ (dr)) 3799 { 3800 if (dump_enabled_p ()) 3801 { 3802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3803 "not vectorized: not suitable for strided " 3804 "load "); 3805 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3806 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3807 } 3808 return false; 3809 } 3810 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true; 3811 } 3812 } 3813 3814 /* If we stopped analysis at the first dataref we could not analyze 3815 when trying to vectorize a basic-block mark the rest of the datarefs 3816 as not vectorizable and truncate the vector of datarefs. That 3817 avoids spending useless time in analyzing their dependence. 
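     For example (illustrative): if DATAREFS is {d0, d1, d2} and the
     analysis above stopped at d1 (i == 1), then d1 and d2 are marked as
     not vectorizable and freed, and DATAREFS is truncated to {d0}.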
*/ 3818 if (i != datarefs.length ()) 3819 { 3820 gcc_assert (bb_vinfo != NULL); 3821 for (unsigned j = i; j < datarefs.length (); ++j) 3822 { 3823 data_reference_p dr = datarefs[j]; 3824 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 3825 free_data_ref (dr); 3826 } 3827 datarefs.truncate (i); 3828 } 3829 3830 return true; 3831 } 3832 3833 3834 /* Function vect_get_new_vect_var. 3835 3836 Returns a name for a new variable. The current naming scheme appends the 3837 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 3838 the name of vectorizer generated variables, and appends that to NAME if 3839 provided. */ 3840 3841 tree 3842 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 3843 { 3844 const char *prefix; 3845 tree new_vect_var; 3846 3847 switch (var_kind) 3848 { 3849 case vect_simple_var: 3850 prefix = "vect"; 3851 break; 3852 case vect_scalar_var: 3853 prefix = "stmp"; 3854 break; 3855 case vect_pointer_var: 3856 prefix = "vectp"; 3857 break; 3858 default: 3859 gcc_unreachable (); 3860 } 3861 3862 if (name) 3863 { 3864 char* tmp = concat (prefix, "_", name, NULL); 3865 new_vect_var = create_tmp_reg (type, tmp); 3866 free (tmp); 3867 } 3868 else 3869 new_vect_var = create_tmp_reg (type, prefix); 3870 3871 return new_vect_var; 3872 } 3873 3874 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */ 3875 3876 static void 3877 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr, 3878 stmt_vec_info stmt_info) 3879 { 3880 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 3881 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info)); 3882 int misalign = DR_MISALIGNMENT (dr); 3883 if (misalign == -1) 3884 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 3885 else 3886 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign); 3887 } 3888 3889 /* Function vect_create_addr_base_for_vector_ref. 3890 3891 Create an expression that computes the address of the first memory location 3892 that will be accessed for a data reference. 3893 3894 Input: 3895 STMT: The statement containing the data reference. 3896 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 3897 OFFSET: Optional. If supplied, it is be added to the initial address. 3898 LOOP: Specify relative to which loop-nest should the address be computed. 3899 For example, when the dataref is in an inner-loop nested in an 3900 outer-loop that is now being vectorized, LOOP can be either the 3901 outer-loop, or the inner-loop. The first memory location accessed 3902 by the following dataref ('in' points to short): 3903 3904 for (i=0; i<N; i++) 3905 for (j=0; j<M; j++) 3906 s += in[i+j] 3907 3908 is as follows: 3909 if LOOP=i_loop: &in (relative to i_loop) 3910 if LOOP=j_loop: &in+i*2B (relative to j_loop) 3911 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 3912 initial address. Unlike OFFSET, which is number of elements to 3913 be added, BYTE_OFFSET is measured in bytes. 3914 3915 Output: 3916 1. Return an SSA_NAME whose value is the address of the memory location of 3917 the first vector of the data reference. 3918 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 3919 these statement(s) which define the returned SSA_NAME. 3920 3921 FORNOW: We are only handling array accesses with step 1. 
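   As an illustrative sketch (hypothetical dataref into a 'short' array A):
   with OFFSET == 4 the address built here is &A[init] + 4 * 2 bytes,
   because OFFSET is scaled by the element size, whereas with
   BYTE_OFFSET == 4 it is &A[init] + 4 bytes.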
*/ 3922 3923 tree 3924 vect_create_addr_base_for_vector_ref (gimple stmt, 3925 gimple_seq *new_stmt_list, 3926 tree offset, 3927 struct loop *loop, 3928 tree byte_offset) 3929 { 3930 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3931 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3932 tree data_ref_base; 3933 const char *base_name; 3934 tree addr_base; 3935 tree dest; 3936 gimple_seq seq = NULL; 3937 tree base_offset; 3938 tree init; 3939 tree vect_ptr_type; 3940 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 3941 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 3942 3943 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father) 3944 { 3945 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo); 3946 3947 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt)); 3948 3949 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 3950 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); 3951 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); 3952 } 3953 else 3954 { 3955 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); 3956 base_offset = unshare_expr (DR_OFFSET (dr)); 3957 init = unshare_expr (DR_INIT (dr)); 3958 } 3959 3960 if (loop_vinfo) 3961 base_name = get_name (data_ref_base); 3962 else 3963 { 3964 base_offset = ssize_int (0); 3965 init = ssize_int (0); 3966 base_name = get_name (DR_REF (dr)); 3967 } 3968 3969 /* Create base_offset */ 3970 base_offset = size_binop (PLUS_EXPR, 3971 fold_convert (sizetype, base_offset), 3972 fold_convert (sizetype, init)); 3973 3974 if (offset) 3975 { 3976 offset = fold_build2 (MULT_EXPR, sizetype, 3977 fold_convert (sizetype, offset), step); 3978 base_offset = fold_build2 (PLUS_EXPR, sizetype, 3979 base_offset, offset); 3980 } 3981 if (byte_offset) 3982 { 3983 byte_offset = fold_convert (sizetype, byte_offset); 3984 base_offset = fold_build2 (PLUS_EXPR, sizetype, 3985 base_offset, byte_offset); 3986 } 3987 3988 /* base + base_offset */ 3989 if (loop_vinfo) 3990 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 3991 else 3992 { 3993 addr_base = build1 (ADDR_EXPR, 3994 build_pointer_type (TREE_TYPE (DR_REF (dr))), 3995 unshare_expr (DR_REF (dr))); 3996 } 3997 3998 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 3999 addr_base = fold_convert (vect_ptr_type, addr_base); 4000 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4001 addr_base = force_gimple_operand (addr_base, &seq, false, dest); 4002 gimple_seq_add_seq (new_stmt_list, seq); 4003 4004 if (DR_PTR_INFO (dr) 4005 && TREE_CODE (addr_base) == SSA_NAME) 4006 { 4007 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info); 4008 if (offset || byte_offset) 4009 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4010 } 4011 4012 if (dump_enabled_p ()) 4013 { 4014 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4015 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4016 dump_printf (MSG_NOTE, "\n"); 4017 } 4018 4019 return addr_base; 4020 } 4021 4022 4023 /* Function vect_create_data_ref_ptr. 4024 4025 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4026 location accessed in the loop by STMT, along with the def-use update 4027 chain to appropriately advance the pointer through the loop iterations. 4028 Also set aliasing information for the pointer. This pointer is used by 4029 the callers to this function to create a memory reference expression for 4030 vector load/store access. 4031 4032 Input: 4033 1. 
STMT: a stmt that references memory. Expected to be of the form 4034 GIMPLE_ASSIGN <name, data-ref> or 4035 GIMPLE_ASSIGN <data-ref, name>. 4036 2. AGGR_TYPE: the type of the reference, which should be either a vector 4037 or an array. 4038 3. AT_LOOP: the loop where the vector memref is to be created. 4039 4. OFFSET (optional): an offset to be added to the initial address accessed 4040 by the data-ref in STMT. 4041 5. BSI: location where the new stmts are to be placed if there is no loop 4042 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain 4043 pointing to the initial address. 4044 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added 4045 to the initial address accessed by the data-ref in STMT. This is 4046 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET 4047 in bytes. 4048 4049 Output: 4050 1. Declare a new ptr to vector_type, and have it point to the base of the 4051 data reference (initial addressed accessed by the data reference). 4052 For example, for vector of type V8HI, the following code is generated: 4053 4054 v8hi *ap; 4055 ap = (v8hi *)initial_address; 4056 4057 if OFFSET is not supplied: 4058 initial_address = &a[init]; 4059 if OFFSET is supplied: 4060 initial_address = &a[init + OFFSET]; 4061 if BYTE_OFFSET is supplied: 4062 initial_address = &a[init] + BYTE_OFFSET; 4063 4064 Return the initial_address in INITIAL_ADDRESS. 4065 4066 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also 4067 update the pointer in each iteration of the loop. 4068 4069 Return the increment stmt that updates the pointer in PTR_INCR. 4070 4071 3. Set INV_P to true if the access pattern of the data reference in the 4072 vectorized loop is invariant. Set it to false otherwise. 4073 4074 4. Return the pointer. */ 4075 4076 tree 4077 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop, 4078 tree offset, tree *initial_address, 4079 gimple_stmt_iterator *gsi, gimple *ptr_incr, 4080 bool only_init, bool *inv_p, tree byte_offset) 4081 { 4082 const char *base_name; 4083 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4084 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4085 struct loop *loop = NULL; 4086 bool nested_in_vect_loop = false; 4087 struct loop *containing_loop = NULL; 4088 tree aggr_ptr_type; 4089 tree aggr_ptr; 4090 tree new_temp; 4091 gimple vec_stmt; 4092 gimple_seq new_stmt_list = NULL; 4093 edge pe = NULL; 4094 basic_block new_bb; 4095 tree aggr_ptr_init; 4096 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4097 tree aptr; 4098 gimple_stmt_iterator incr_gsi; 4099 bool insert_after; 4100 tree indx_before_incr, indx_after_incr; 4101 gimple incr; 4102 tree step; 4103 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4104 4105 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE 4106 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4107 4108 if (loop_vinfo) 4109 { 4110 loop = LOOP_VINFO_LOOP (loop_vinfo); 4111 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4112 containing_loop = (gimple_bb (stmt))->loop_father; 4113 pe = loop_preheader_edge (loop); 4114 } 4115 else 4116 { 4117 gcc_assert (bb_vinfo); 4118 only_init = true; 4119 *ptr_incr = NULL; 4120 } 4121 4122 /* Check the step (evolution) of the load in LOOP, and record 4123 whether it's invariant. 
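     (An invariant access here is, illustratively, a load like 'x = *p'
     whose address does not change inside LOOP, i.e. its step is zero;
     for such an access the IV step used below is zero as well, so the
     pointer is not advanced.)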
*/ 4124 if (nested_in_vect_loop) 4125 step = STMT_VINFO_DR_STEP (stmt_info); 4126 else 4127 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); 4128 4129 if (integer_zerop (step)) 4130 *inv_p = true; 4131 else 4132 *inv_p = false; 4133 4134 /* Create an expression for the first address accessed by this load 4135 in LOOP. */ 4136 base_name = get_name (DR_BASE_ADDRESS (dr)); 4137 4138 if (dump_enabled_p ()) 4139 { 4140 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4141 dump_printf_loc (MSG_NOTE, vect_location, 4142 "create %s-pointer variable to type: ", 4143 get_tree_code_name (TREE_CODE (aggr_type))); 4144 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4145 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4146 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4147 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4148 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4149 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4150 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4151 else 4152 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4153 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4154 dump_printf (MSG_NOTE, "\n"); 4155 } 4156 4157 /* (1) Create the new aggregate-pointer variable. 4158 Vector and array types inherit the alias set of their component 4159 type by default so we need to use a ref-all pointer if the data 4160 reference does not conflict with the created aggregated data 4161 reference because it is not addressable. */ 4162 bool need_ref_all = false; 4163 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4164 get_alias_set (DR_REF (dr)))) 4165 need_ref_all = true; 4166 /* Likewise for any of the data references in the stmt group. */ 4167 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4168 { 4169 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4170 do 4171 { 4172 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4173 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4174 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4175 get_alias_set (DR_REF (sdr)))) 4176 { 4177 need_ref_all = true; 4178 break; 4179 } 4180 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4181 } 4182 while (orig_stmt); 4183 } 4184 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4185 need_ref_all); 4186 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4187 4188 4189 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4190 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4191 def-use update cycles for the pointer: one relative to the outer-loop 4192 (LOOP), which is what steps (3) and (4) below do. The other is relative 4193 to the inner-loop (which is the inner-most loop containing the dataref), 4194 and this is done be step (5) below. 4195 4196 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4197 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4198 redundant. Steps (3),(4) create the following: 4199 4200 vp0 = &base_addr; 4201 LOOP: vp1 = phi(vp0,vp2) 4202 ... 4203 ... 4204 vp2 = vp1 + step 4205 goto LOOP 4206 4207 If there is an inner-loop nested in loop, then step (5) will also be 4208 applied, and an additional update in the inner-loop will be created: 4209 4210 vp0 = &base_addr; 4211 LOOP: vp1 = phi(vp0,vp2) 4212 ... 4213 inner: vp3 = phi(vp1,vp4) 4214 vp4 = vp3 + inner_step 4215 if () goto inner 4216 ... 
4217 vp2 = vp1 + step 4218 if () goto LOOP */ 4219 4220 /* (2) Calculate the initial address of the aggregate-pointer, and set 4221 the aggregate-pointer to point to it before the loop. */ 4222 4223 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4224 4225 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4226 offset, loop, byte_offset); 4227 if (new_stmt_list) 4228 { 4229 if (pe) 4230 { 4231 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4232 gcc_assert (!new_bb); 4233 } 4234 else 4235 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4236 } 4237 4238 *initial_address = new_temp; 4239 4240 /* Create: p = (aggr_type *) initial_base */ 4241 if (TREE_CODE (new_temp) != SSA_NAME 4242 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp))) 4243 { 4244 vec_stmt = gimple_build_assign (aggr_ptr, 4245 fold_convert (aggr_ptr_type, new_temp)); 4246 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt); 4247 /* Copy the points-to information if it exists. */ 4248 if (DR_PTR_INFO (dr)) 4249 vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info); 4250 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init); 4251 if (pe) 4252 { 4253 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt); 4254 gcc_assert (!new_bb); 4255 } 4256 else 4257 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT); 4258 } 4259 else 4260 aggr_ptr_init = new_temp; 4261 4262 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4263 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4264 inner-loop nested in LOOP (during outer-loop vectorization). */ 4265 4266 /* No update in loop is required. */ 4267 if (only_init && (!loop_vinfo || at_loop == loop)) 4268 aptr = aggr_ptr_init; 4269 else 4270 { 4271 /* The step of the aggregate pointer is the type size. */ 4272 tree iv_step = TYPE_SIZE_UNIT (aggr_type); 4273 /* One exception to the above is when the scalar step of the load in 4274 LOOP is zero. In this case the step here is also zero. */ 4275 if (*inv_p) 4276 iv_step = size_zero_node; 4277 else if (tree_int_cst_sgn (step) == -1) 4278 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4279 4280 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4281 4282 create_iv (aggr_ptr_init, 4283 fold_convert (aggr_ptr_type, iv_step), 4284 aggr_ptr, loop, &incr_gsi, insert_after, 4285 &indx_before_incr, &indx_after_incr); 4286 incr = gsi_stmt (incr_gsi); 4287 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL)); 4288 4289 /* Copy the points-to information if it exists. */ 4290 if (DR_PTR_INFO (dr)) 4291 { 4292 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4293 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4294 } 4295 if (ptr_incr) 4296 *ptr_incr = incr; 4297 4298 aptr = indx_before_incr; 4299 } 4300 4301 if (!nested_in_vect_loop || only_init) 4302 return aptr; 4303 4304 4305 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4306 nested in LOOP, if exists. 
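     This creates the inner-loop def-use cycle sketched in the big comment
     above, i.e. (illustrative):
	    inner:  vp3 = phi (vp1, vp4)
		    vp4 = vp3 + DR_STEP
     with the increment placed at the standard IV increment position of
     the containing loop.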
*/ 4307 4308 gcc_assert (nested_in_vect_loop); 4309 if (!only_init) 4310 { 4311 standard_iv_increment_position (containing_loop, &incr_gsi, 4312 &insert_after); 4313 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4314 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4315 &indx_after_incr); 4316 incr = gsi_stmt (incr_gsi); 4317 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL)); 4318 4319 /* Copy the points-to information if it exists. */ 4320 if (DR_PTR_INFO (dr)) 4321 { 4322 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4323 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4324 } 4325 if (ptr_incr) 4326 *ptr_incr = incr; 4327 4328 return indx_before_incr; 4329 } 4330 else 4331 gcc_unreachable (); 4332 } 4333 4334 4335 /* Function bump_vector_ptr 4336 4337 Increment a pointer (to a vector type) by vector-size. If requested, 4338 i.e. if PTR-INCR is given, then also connect the new increment stmt 4339 to the existing def-use update-chain of the pointer, by modifying 4340 the PTR_INCR as illustrated below: 4341 4342 The pointer def-use update-chain before this function: 4343 DATAREF_PTR = phi (p_0, p_2) 4344 .... 4345 PTR_INCR: p_2 = DATAREF_PTR + step 4346 4347 The pointer def-use update-chain after this function: 4348 DATAREF_PTR = phi (p_0, p_2) 4349 .... 4350 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4351 .... 4352 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4353 4354 Input: 4355 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4356 in the loop. 4357 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4358 the loop. The increment amount across iterations is expected 4359 to be vector_size. 4360 BSI - location where the new update stmt is to be placed. 4361 STMT - the original scalar memory-access stmt that is being vectorized. 4362 BUMP - optional. The offset by which to bump the pointer. If not given, 4363 the offset is assumed to be vector_size. 4364 4365 Output: Return NEW_DATAREF_PTR as illustrated above. 4366 4367 */ 4368 4369 tree 4370 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi, 4371 gimple stmt, tree bump) 4372 { 4373 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4374 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4375 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4376 tree update = TYPE_SIZE_UNIT (vectype); 4377 gassign *incr_stmt; 4378 ssa_op_iter iter; 4379 use_operand_p use_p; 4380 tree new_dataref_ptr; 4381 4382 if (bump) 4383 update = bump; 4384 4385 new_dataref_ptr = copy_ssa_name (dataref_ptr); 4386 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 4387 dataref_ptr, update); 4388 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 4389 4390 /* Copy the points-to information if it exists. */ 4391 if (DR_PTR_INFO (dr)) 4392 { 4393 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 4394 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 4395 } 4396 4397 if (!ptr_incr) 4398 return new_dataref_ptr; 4399 4400 /* Update the vector-pointer's cross-iteration increment. */ 4401 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 4402 { 4403 tree use = USE_FROM_PTR (use_p); 4404 4405 if (use == dataref_ptr) 4406 SET_USE (use_p, new_dataref_ptr); 4407 else 4408 gcc_assert (tree_int_cst_compare (use, update) == 0); 4409 } 4410 4411 return new_dataref_ptr; 4412 } 4413 4414 4415 /* Function vect_create_destination_var. 

   Create a new temporary of type VECTYPE.  */

tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *name;
  char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
  if (name)
    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
  else
    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  free (new_name);

  return vec_dest;
}

/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */

bool
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "the size of the group of accesses"
                         " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      if (count == 3)
        {
          unsigned int j0 = 0, j1 = 0, j2 = 0;
          unsigned int i, j;

          for (j = 0; j < 3; j++)
            {
              int nelt0 = ((3 - j) * nelt) % 3;
              int nelt1 = ((3 - j) * nelt + 1) % 3;
              int nelt2 = ((3 - j) * nelt + 2) % 3;
              for (i = 0; i < nelt; i++)
                {
                  if (3 * i + nelt0 < nelt)
                    sel[3 * i + nelt0] = j0++;
                  if (3 * i + nelt1 < nelt)
                    sel[3 * i + nelt1] = nelt + j1++;
                  if (3 * i + nelt2 < nelt)
                    sel[3 * i + nelt2] = 0;
                }
              if (!can_vec_perm_p (mode, false, sel))
                {
                  if (dump_enabled_p ())
                    dump_printf (MSG_MISSED_OPTIMIZATION,
                                 "permutation op not supported by target.\n");
                  return false;
                }

              for (i = 0; i < nelt; i++)
                {
                  if (3 * i + nelt0 < nelt)
                    sel[3 * i + nelt0] = 3 * i + nelt0;
                  if (3 * i + nelt1 < nelt)
                    sel[3 * i + nelt1] = 3 * i + nelt1;
                  if (3 * i + nelt2 < nelt)
                    sel[3 * i + nelt2] = nelt + j2++;
                }
              if (!can_vec_perm_p (mode, false, sel))
                {
                  if (dump_enabled_p ())
                    dump_printf (MSG_MISSED_OPTIMIZATION,
                                 "permutation op not supported by target.\n");
                  return false;
                }
            }
          return true;
        }
      else
        {
          /* If length is not equal to 3 then only power of 2 is supported.  */
          gcc_assert (exact_log2 (count) != -1);

          for (i = 0; i < nelt / 2; i++)
            {
              sel[i * 2] = i;
              sel[i * 2 + 1] = i + nelt;
            }
          if (can_vec_perm_p (mode, false, sel))
            {
              for (i = 0; i < nelt; i++)
                sel[i] += nelt / 2;
              if (can_vec_perm_p (mode, false, sel))
                return true;
            }
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_MISSED_OPTIMIZATION,
                 "permutation op not supported by target.\n");
  return false;
}


/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.
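   (vec_store_lanes corresponds to storing the lanes of COUNT vectors in
   interleaved order with a single operation, e.g. instructions in the
   style of NEON vst2/vst3/vst4; this is only an illustrative pointer to
   the kind of instruction involved, the actual check is the optab query
   done by the function below.)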
*/

bool
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_store_lanes",
                                       vec_store_lanes_optab,
                                       vectype, count);
}


/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
   the data correctly for the stores.  Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:   0  8 16 24  1  9 17 25
   2nd vec:   2 10 18 26  3 11 19 27
   3rd vec:   4 12 20 28  5 13 21 29
   4th vec:   6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input
   of each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7
   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7


   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
   In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result, is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.
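   At the source level such a chain typically originates from a grouped
   (interleaved) store; an illustrative example (hypothetical arrays) is:

     for (i = 0; i < n; i++)
       {
         out[2*i]   = a[i];
         out[2*i+1] = b[i];
       }

   where LENGTH == 2 and the two vectors of rhs values must be interleaved
   before being stored.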
*/ 4616 4617 void 4618 vect_permute_store_chain (vec<tree> dr_chain, 4619 unsigned int length, 4620 gimple stmt, 4621 gimple_stmt_iterator *gsi, 4622 vec<tree> *result_chain) 4623 { 4624 tree vect1, vect2, high, low; 4625 gimple perm_stmt; 4626 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 4627 tree perm_mask_low, perm_mask_high; 4628 tree data_ref; 4629 tree perm3_mask_low, perm3_mask_high; 4630 unsigned int i, n, log_length = exact_log2 (length); 4631 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); 4632 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4633 4634 result_chain->quick_grow (length); 4635 memcpy (result_chain->address (), dr_chain.address (), 4636 length * sizeof (tree)); 4637 4638 if (length == 3) 4639 { 4640 unsigned int j0 = 0, j1 = 0, j2 = 0; 4641 4642 for (j = 0; j < 3; j++) 4643 { 4644 int nelt0 = ((3 - j) * nelt) % 3; 4645 int nelt1 = ((3 - j) * nelt + 1) % 3; 4646 int nelt2 = ((3 - j) * nelt + 2) % 3; 4647 4648 for (i = 0; i < nelt; i++) 4649 { 4650 if (3 * i + nelt0 < nelt) 4651 sel[3 * i + nelt0] = j0++; 4652 if (3 * i + nelt1 < nelt) 4653 sel[3 * i + nelt1] = nelt + j1++; 4654 if (3 * i + nelt2 < nelt) 4655 sel[3 * i + nelt2] = 0; 4656 } 4657 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 4658 4659 for (i = 0; i < nelt; i++) 4660 { 4661 if (3 * i + nelt0 < nelt) 4662 sel[3 * i + nelt0] = 3 * i + nelt0; 4663 if (3 * i + nelt1 < nelt) 4664 sel[3 * i + nelt1] = 3 * i + nelt1; 4665 if (3 * i + nelt2 < nelt) 4666 sel[3 * i + nelt2] = nelt + j2++; 4667 } 4668 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 4669 4670 vect1 = dr_chain[0]; 4671 vect2 = dr_chain[1]; 4672 4673 /* Create interleaving stmt: 4674 low = VEC_PERM_EXPR <vect1, vect2, 4675 {j, nelt, *, j + 1, nelt + j + 1, *, 4676 j + 2, nelt + j + 2, *, ...}> */ 4677 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 4678 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 4679 vect2, perm3_mask_low); 4680 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4681 4682 vect1 = data_ref; 4683 vect2 = dr_chain[2]; 4684 /* Create interleaving stmt: 4685 low = VEC_PERM_EXPR <vect1, vect2, 4686 {0, 1, nelt + j, 3, 4, nelt + j + 1, 4687 6, 7, nelt + j + 2, ...}> */ 4688 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 4689 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 4690 vect2, perm3_mask_high); 4691 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4692 (*result_chain)[j] = data_ref; 4693 } 4694 } 4695 else 4696 { 4697 /* If length is not equal to 3 then only power of 2 is supported. 
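	 For example (illustrative, V8HI, so NELT == 8): the interleave-high
	 selector built below is {0, 8, 1, 9, 2, 10, 3, 11} and the
	 interleave-low selector is {4, 12, 5, 13, 6, 14, 7, 15}.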
*/ 4698 gcc_assert (exact_log2 (length) != -1); 4699 4700 for (i = 0, n = nelt / 2; i < n; i++) 4701 { 4702 sel[i * 2] = i; 4703 sel[i * 2 + 1] = i + nelt; 4704 } 4705 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel); 4706 4707 for (i = 0; i < nelt; i++) 4708 sel[i] += nelt / 2; 4709 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel); 4710 4711 for (i = 0, n = log_length; i < n; i++) 4712 { 4713 for (j = 0; j < length/2; j++) 4714 { 4715 vect1 = dr_chain[j]; 4716 vect2 = dr_chain[j+length/2]; 4717 4718 /* Create interleaving stmt: 4719 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 4720 ...}> */ 4721 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 4722 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 4723 vect2, perm_mask_high); 4724 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4725 (*result_chain)[2*j] = high; 4726 4727 /* Create interleaving stmt: 4728 low = VEC_PERM_EXPR <vect1, vect2, 4729 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 4730 ...}> */ 4731 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 4732 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 4733 vect2, perm_mask_low); 4734 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4735 (*result_chain)[2*j+1] = low; 4736 } 4737 memcpy (dr_chain.address (), result_chain->address (), 4738 length * sizeof (tree)); 4739 } 4740 } 4741 } 4742 4743 /* Function vect_setup_realignment 4744 4745 This function is called when vectorizing an unaligned load using 4746 the dr_explicit_realign[_optimized] scheme. 4747 This function generates the following code at the loop prolog: 4748 4749 p = initial_addr; 4750 x msq_init = *(floor(p)); # prolog load 4751 realignment_token = call target_builtin; 4752 loop: 4753 x msq = phi (msq_init, ---) 4754 4755 The stmts marked with x are generated only for the case of 4756 dr_explicit_realign_optimized. 4757 4758 The code above sets up a new (vector) pointer, pointing to the first 4759 location accessed by STMT, and a "floor-aligned" load using that pointer. 4760 It also generates code to compute the "realignment-token" (if the relevant 4761 target hook was defined), and creates a phi-node at the loop-header bb 4762 whose arguments are the result of the prolog-load (created by this 4763 function) and the result of a load that takes place in the loop (to be 4764 created by the caller to this function). 4765 4766 For the case of dr_explicit_realign_optimized: 4767 The caller to this function uses the phi-result (msq) to create the 4768 realignment code inside the loop, and sets up the missing phi argument, 4769 as follows: 4770 loop: 4771 msq = phi (msq_init, lsq) 4772 lsq = *(floor(p')); # load in loop 4773 result = realign_load (msq, lsq, realignment_token); 4774 4775 For the case of dr_explicit_realign: 4776 loop: 4777 msq = *(floor(p)); # load in loop 4778 p' = p + (VS-1); 4779 lsq = *(floor(p')); # load in loop 4780 result = realign_load (msq, lsq, realignment_token); 4781 4782 Input: 4783 STMT - (scalar) load stmt to be vectorized. This load accesses 4784 a memory location that may be unaligned. 4785 BSI - place where new code is to be inserted. 4786 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 4787 is used. 4788 4789 Output: 4790 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 4791 target hook, if defined. 4792 Return value - the result of the loop-header phi node. 
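   An illustrative example of a load needing this scheme is 's += a[i+1]'
   when the access is not guaranteed to be vector-aligned: msq and lsq are
   the two floor-aligned loads that straddle the desired misaligned vector,
   and REALIGN_LOAD combines them using REALIGNMENT_TOKEN.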
*/ 4793 4794 tree 4795 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, 4796 tree *realignment_token, 4797 enum dr_alignment_support alignment_support_scheme, 4798 tree init_addr, 4799 struct loop **at_loop) 4800 { 4801 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4802 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4803 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4804 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4805 struct loop *loop = NULL; 4806 edge pe = NULL; 4807 tree scalar_dest = gimple_assign_lhs (stmt); 4808 tree vec_dest; 4809 gimple inc; 4810 tree ptr; 4811 tree data_ref; 4812 basic_block new_bb; 4813 tree msq_init = NULL_TREE; 4814 tree new_temp; 4815 gphi *phi_stmt; 4816 tree msq = NULL_TREE; 4817 gimple_seq stmts = NULL; 4818 bool inv_p; 4819 bool compute_in_loop = false; 4820 bool nested_in_vect_loop = false; 4821 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 4822 struct loop *loop_for_initial_load = NULL; 4823 4824 if (loop_vinfo) 4825 { 4826 loop = LOOP_VINFO_LOOP (loop_vinfo); 4827 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4828 } 4829 4830 gcc_assert (alignment_support_scheme == dr_explicit_realign 4831 || alignment_support_scheme == dr_explicit_realign_optimized); 4832 4833 /* We need to generate three things: 4834 1. the misalignment computation 4835 2. the extra vector load (for the optimized realignment scheme). 4836 3. the phi node for the two vectors from which the realignment is 4837 done (for the optimized realignment scheme). */ 4838 4839 /* 1. Determine where to generate the misalignment computation. 4840 4841 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 4842 calculation will be generated by this function, outside the loop (in the 4843 preheader). Otherwise, INIT_ADDR had already been computed for us by the 4844 caller, inside the loop. 4845 4846 Background: If the misalignment remains fixed throughout the iterations of 4847 the loop, then both realignment schemes are applicable, and also the 4848 misalignment computation can be done outside LOOP. This is because we are 4849 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 4850 are a multiple of VS (the Vector Size), and therefore the misalignment in 4851 different vectorized LOOP iterations is always the same. 4852 The problem arises only if the memory access is in an inner-loop nested 4853 inside LOOP, which is now being vectorized using outer-loop vectorization. 4854 This is the only case when the misalignment of the memory access may not 4855 remain fixed throughout the iterations of the inner-loop (as explained in 4856 detail in vect_supportable_dr_alignment). In this case, not only is the 4857 optimized realignment scheme not applicable, but also the misalignment 4858 computation (and generation of the realignment token that is passed to 4859 REALIGN_LOAD) have to be done inside the loop. 4860 4861 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 4862 or not, which in turn determines if the misalignment is computed inside 4863 the inner-loop, or outside LOOP. */ 4864 4865 if (init_addr != NULL_TREE || !loop_vinfo) 4866 { 4867 compute_in_loop = true; 4868 gcc_assert (alignment_support_scheme == dr_explicit_realign); 4869 } 4870 4871 4872 /* 2. Determine where to generate the extra vector load. 
4873 4874 For the optimized realignment scheme, instead of generating two vector 4875 loads in each iteration, we generate a single extra vector load in the 4876 preheader of the loop, and in each iteration reuse the result of the 4877 vector load from the previous iteration. In case the memory access is in 4878 an inner-loop nested inside LOOP, which is now being vectorized using 4879 outer-loop vectorization, we need to determine whether this initial vector 4880 load should be generated at the preheader of the inner-loop, or can be 4881 generated at the preheader of LOOP. If the memory access has no evolution 4882 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 4883 to be generated inside LOOP (in the preheader of the inner-loop). */ 4884 4885 if (nested_in_vect_loop) 4886 { 4887 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 4888 bool invariant_in_outerloop = 4889 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 4890 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 4891 } 4892 else 4893 loop_for_initial_load = loop; 4894 if (at_loop) 4895 *at_loop = loop_for_initial_load; 4896 4897 if (loop_for_initial_load) 4898 pe = loop_preheader_edge (loop_for_initial_load); 4899 4900 /* 3. For the case of the optimized realignment, create the first vector 4901 load at the loop preheader. */ 4902 4903 if (alignment_support_scheme == dr_explicit_realign_optimized) 4904 { 4905 /* Create msq_init = *(floor(p1)) in the loop preheader */ 4906 gassign *new_stmt; 4907 4908 gcc_assert (!compute_in_loop); 4909 vec_dest = vect_create_destination_var (scalar_dest, vectype); 4910 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 4911 NULL_TREE, &init_addr, NULL, &inc, 4912 true, &inv_p); 4913 new_temp = copy_ssa_name (ptr); 4914 new_stmt = gimple_build_assign 4915 (new_temp, BIT_AND_EXPR, ptr, 4916 build_int_cst (TREE_TYPE (ptr), 4917 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype))); 4918 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 4919 gcc_assert (!new_bb); 4920 data_ref 4921 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 4922 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 4923 new_stmt = gimple_build_assign (vec_dest, data_ref); 4924 new_temp = make_ssa_name (vec_dest, new_stmt); 4925 gimple_assign_set_lhs (new_stmt, new_temp); 4926 if (pe) 4927 { 4928 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 4929 gcc_assert (!new_bb); 4930 } 4931 else 4932 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 4933 4934 msq_init = gimple_assign_lhs (new_stmt); 4935 } 4936 4937 /* 4. Create realignment token using a target builtin, if available. 4938 It is done either inside the containing loop, or before LOOP (as 4939 determined above). */ 4940 4941 if (targetm.vectorize.builtin_mask_for_load) 4942 { 4943 gcall *new_stmt; 4944 tree builtin_decl; 4945 4946 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 4947 if (!init_addr) 4948 { 4949 /* Generate the INIT_ADDR computation outside LOOP. 
*/ 4950 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 4951 NULL_TREE, loop); 4952 if (loop) 4953 { 4954 pe = loop_preheader_edge (loop); 4955 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 4956 gcc_assert (!new_bb); 4957 } 4958 else 4959 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 4960 } 4961 4962 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 4963 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 4964 vec_dest = 4965 vect_create_destination_var (scalar_dest, 4966 gimple_call_return_type (new_stmt)); 4967 new_temp = make_ssa_name (vec_dest, new_stmt); 4968 gimple_call_set_lhs (new_stmt, new_temp); 4969 4970 if (compute_in_loop) 4971 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 4972 else 4973 { 4974 /* Generate the misalignment computation outside LOOP. */ 4975 pe = loop_preheader_edge (loop); 4976 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 4977 gcc_assert (!new_bb); 4978 } 4979 4980 *realignment_token = gimple_call_lhs (new_stmt); 4981 4982 /* The result of the CALL_EXPR to this builtin is determined from 4983 the value of the parameter and no global variables are touched 4984 which makes the builtin a "const" function. Requiring the 4985 builtin to have the "const" attribute makes it unnecessary 4986 to call mark_call_clobbered. */ 4987 gcc_assert (TREE_READONLY (builtin_decl)); 4988 } 4989 4990 if (alignment_support_scheme == dr_explicit_realign) 4991 return msq; 4992 4993 gcc_assert (!compute_in_loop); 4994 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 4995 4996 4997 /* 5. Create msq = phi <msq_init, lsq> in loop */ 4998 4999 pe = loop_preheader_edge (containing_loop); 5000 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5001 msq = make_ssa_name (vec_dest); 5002 phi_stmt = create_phi_node (msq, containing_loop->header); 5003 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5004 5005 return msq; 5006 } 5007 5008 5009 /* Function vect_grouped_load_supported. 5010 5011 Returns TRUE if even and odd permutations are supported, 5012 and FALSE otherwise. */ 5013 5014 bool 5015 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) 5016 { 5017 machine_mode mode = TYPE_MODE (vectype); 5018 5019 /* vect_permute_load_chain requires the group size to be equal to 3 or 5020 be a power of two. */ 5021 if (count != 3 && exact_log2 (count) == -1) 5022 { 5023 if (dump_enabled_p ()) 5024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5025 "the size of the group of accesses" 5026 " is not a power of 2 or not equal to 3\n"); 5027 return false; 5028 } 5029 5030 /* Check that the permutation is supported. 
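     For the power-of-2 case below this amounts to checking the extract-even
     and extract-odd selectors, e.g. (illustrative, NELT == 8)
     {0, 2, 4, 6, 8, 10, 12, 14} and {1, 3, 5, 7, 9, 11, 13, 15}.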
*/ 5031 if (VECTOR_MODE_P (mode)) 5032 { 5033 unsigned int i, j, nelt = GET_MODE_NUNITS (mode); 5034 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5035 5036 if (count == 3) 5037 { 5038 unsigned int k; 5039 for (k = 0; k < 3; k++) 5040 { 5041 for (i = 0; i < nelt; i++) 5042 if (3 * i + k < 2 * nelt) 5043 sel[i] = 3 * i + k; 5044 else 5045 sel[i] = 0; 5046 if (!can_vec_perm_p (mode, false, sel)) 5047 { 5048 if (dump_enabled_p ()) 5049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5050 "shuffle of 3 loads is not supported by" 5051 " target\n"); 5052 return false; 5053 } 5054 for (i = 0, j = 0; i < nelt; i++) 5055 if (3 * i + k < 2 * nelt) 5056 sel[i] = i; 5057 else 5058 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5059 if (!can_vec_perm_p (mode, false, sel)) 5060 { 5061 if (dump_enabled_p ()) 5062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5063 "shuffle of 3 loads is not supported by" 5064 " target\n"); 5065 return false; 5066 } 5067 } 5068 return true; 5069 } 5070 else 5071 { 5072 /* If length is not equal to 3 then only power of 2 is supported. */ 5073 gcc_assert (exact_log2 (count) != -1); 5074 for (i = 0; i < nelt; i++) 5075 sel[i] = i * 2; 5076 if (can_vec_perm_p (mode, false, sel)) 5077 { 5078 for (i = 0; i < nelt; i++) 5079 sel[i] = i * 2 + 1; 5080 if (can_vec_perm_p (mode, false, sel)) 5081 return true; 5082 } 5083 } 5084 } 5085 5086 if (dump_enabled_p ()) 5087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5088 "extract even/odd not supported by target\n"); 5089 return false; 5090 } 5091 5092 /* Return TRUE if vec_load_lanes is available for COUNT vectors of 5093 type VECTYPE. */ 5094 5095 bool 5096 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 5097 { 5098 return vect_lanes_optab_supported_p ("vec_load_lanes", 5099 vec_load_lanes_optab, 5100 vectype, count); 5101 } 5102 5103 /* Function vect_permute_load_chain. 5104 5105 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5106 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5107 the input data correctly. Return the final references for loads in 5108 RESULT_CHAIN. 5109 5110 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5111 The input is 4 vectors each containing 8 elements. We assign a number to each 5112 element, the input sequence is: 5113 5114 1st vec: 0 1 2 3 4 5 6 7 5115 2nd vec: 8 9 10 11 12 13 14 15 5116 3rd vec: 16 17 18 19 20 21 22 23 5117 4th vec: 24 25 26 27 28 29 30 31 5118 5119 The output sequence should be: 5120 5121 1st vec: 0 4 8 12 16 20 24 28 5122 2nd vec: 1 5 9 13 17 21 25 29 5123 3rd vec: 2 6 10 14 18 22 26 30 5124 4th vec: 3 7 11 15 19 23 27 31 5125 5126 i.e., the first output vector should contain the first elements of each 5127 interleaving group, etc. 5128 5129 We use extract_even/odd instructions to create such output. The input of 5130 each extract_even/odd operation is two vectors 5131 1st vec 2nd vec 5132 0 1 2 3 4 5 6 7 5133 5134 and the output is the vector of extracted even/odd elements. The output of 5135 extract_even will be: 0 2 4 6 5136 and of extract_odd: 1 3 5 7 5137 5138 5139 The permutation is done in log LENGTH stages. In each stage extract_even 5140 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5141 their order. 
In our example, 5142 5143 E1: extract_even (1st vec, 2nd vec) 5144 E2: extract_odd (1st vec, 2nd vec) 5145 E3: extract_even (3rd vec, 4th vec) 5146 E4: extract_odd (3rd vec, 4th vec) 5147 5148 The output for the first stage will be: 5149 5150 E1: 0 2 4 6 8 10 12 14 5151 E2: 1 3 5 7 9 11 13 15 5152 E3: 16 18 20 22 24 26 28 30 5153 E4: 17 19 21 23 25 27 29 31 5154 5155 In order to proceed and create the correct sequence for the next stage (or 5156 for the correct output, if the second stage is the last one, as in our 5157 example), we first put the output of extract_even operation and then the 5158 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5159 The input for the second stage is: 5160 5161 1st vec (E1): 0 2 4 6 8 10 12 14 5162 2nd vec (E3): 16 18 20 22 24 26 28 30 5163 3rd vec (E2): 1 3 5 7 9 11 13 15 5164 4th vec (E4): 17 19 21 23 25 27 29 31 5165 5166 The output of the second stage: 5167 5168 E1: 0 4 8 12 16 20 24 28 5169 E2: 2 6 10 14 18 22 26 30 5170 E3: 1 5 9 13 17 21 25 29 5171 E4: 3 7 11 15 19 23 27 31 5172 5173 And RESULT_CHAIN after reordering: 5174 5175 1st vec (E1): 0 4 8 12 16 20 24 28 5176 2nd vec (E3): 1 5 9 13 17 21 25 29 5177 3rd vec (E2): 2 6 10 14 18 22 26 30 5178 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5179 5180 static void 5181 vect_permute_load_chain (vec<tree> dr_chain, 5182 unsigned int length, 5183 gimple stmt, 5184 gimple_stmt_iterator *gsi, 5185 vec<tree> *result_chain) 5186 { 5187 tree data_ref, first_vect, second_vect; 5188 tree perm_mask_even, perm_mask_odd; 5189 tree perm3_mask_low, perm3_mask_high; 5190 gimple perm_stmt; 5191 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5192 unsigned int i, j, log_length = exact_log2 (length); 5193 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5194 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5195 5196 result_chain->quick_grow (length); 5197 memcpy (result_chain->address (), dr_chain.address (), 5198 length * sizeof (tree)); 5199 5200 if (length == 3) 5201 { 5202 unsigned int k; 5203 5204 for (k = 0; k < 3; k++) 5205 { 5206 for (i = 0; i < nelt; i++) 5207 if (3 * i + k < 2 * nelt) 5208 sel[i] = 3 * i + k; 5209 else 5210 sel[i] = 0; 5211 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 5212 5213 for (i = 0, j = 0; i < nelt; i++) 5214 if (3 * i + k < 2 * nelt) 5215 sel[i] = i; 5216 else 5217 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5218 5219 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 5220 5221 first_vect = dr_chain[0]; 5222 second_vect = dr_chain[1]; 5223 5224 /* Create interleaving stmt (low part of): 5225 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5226 ...}> */ 5227 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5228 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5229 second_vect, perm3_mask_low); 5230 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5231 5232 /* Create interleaving stmt (high part of): 5233 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5234 ...}> */ 5235 first_vect = data_ref; 5236 second_vect = dr_chain[2]; 5237 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5238 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5239 second_vect, perm3_mask_high); 5240 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5241 (*result_chain)[k] = data_ref; 5242 } 5243 } 5244 else 5245 { 5246 /* If length is not equal to 3 then only power of 2 is supported. 
*/
      gcc_assert (exact_log2 (length) != -1);

      for (i = 0; i < nelt; ++i)
        sel[i] = i * 2;
      perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < nelt; ++i)
        sel[i] = i * 2 + 1;
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j+1];

              /* data_ref = permute_even (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_even);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j/2] = data_ref;

              /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_odd);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j/2+length/2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
    }
}

/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5308 5309 First we shuffle all 3 vectors to get the correct element order: 5310 5311 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) 5312 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) 5313 3rd vec: (16 19 22) (17 20 23) (18 21) 5314 5315 Next we unite and shift the vectors 3 times: 5316 5317 1st step: 5318 shift right by 6 the concatenation of: 5319 "1st vec" and "2nd vec" 5320 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) 5321 "2nd vec" and "3rd vec" 5322 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) 5323 "3rd vec" and "1st vec" 5324 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) 5325 | New vectors | 5326 5327 So that now the new vectors are: 5328 5329 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) 5330 2nd vec: (10 13) (16 19 22) (17 20 23) 5331 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) 5332 5333 2nd step: 5334 shift right by 5 the concatenation of: 5335 "1st vec" and "3rd vec" 5336 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) 5337 "2nd vec" and "1st vec" 5338 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) 5339 "3rd vec" and "2nd vec" 5340 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) 5341 | New vectors | 5342 5343 So that now the new vectors are: 5344 5345 1st vec: ( 9 12 15) (18 21) ( 0 3 6) 5346 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) 5347 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY 5348 5349 3rd step: 5350 shift right by 5 the concatenation of: 5351 "1st vec" and "1st vec" 5352 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) 5353 shift right by 3 the concatenation of: 5354 "2nd vec" and "2nd vec" 5355 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) 5356 | New vectors | 5357 5358 So that now all vectors are READY: 5359 1st vec: ( 0 3 6) ( 9 12 15) (18 21) 5360 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) 5361 3rd vec: ( 1 4 7) (10 13) (16 19 22) 5362 5363 This algorithm is faster than the one in vect_permute_load_chain if: 5364 1. "shift of a concatenation" is faster than general permutation. 5365 This is usually so. 5366 2. The TARGET machine can't execute vector instructions in parallel. 5367 This is because each step of the algorithm depends on the previous one. 5368 The algorithm in vect_permute_load_chain is much more parallel. 5369 5370 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
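For the LENGTH == 2 case, which is handled by the power-of-2 branch below, a sketch assuming nelt == 8 and inputs {0 ... 7} and {8 ... 15}: each input vector is first shuffled with itself, using perm2_mask1 = {0 2 4 6 1 3 5 7} for the first and perm2_mask2 = {1 3 5 7 0 2 4 6} for the second, giving {0 2 4 6 1 3 5 7} and {9 11 13 15 8 10 12 14}; applying shift1_mask = {4 5 6 7 8 9 10 11} to their concatenation then yields the odd elements {1 3 5 7 9 11 13 15}, while select_mask = {0 1 2 3 12 13 14 15} yields the even elements {0 2 4 6 8 10 12 14}.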
5371 */ 5372 5373 static bool 5374 vect_shift_permute_load_chain (vec<tree> dr_chain, 5375 unsigned int length, 5376 gimple stmt, 5377 gimple_stmt_iterator *gsi, 5378 vec<tree> *result_chain) 5379 { 5380 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 5381 tree perm2_mask1, perm2_mask2, perm3_mask; 5382 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 5383 gimple perm_stmt; 5384 5385 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5386 unsigned int i; 5387 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5388 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5389 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5390 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5391 5392 result_chain->quick_grow (length); 5393 memcpy (result_chain->address (), dr_chain.address (), 5394 length * sizeof (tree)); 5395 5396 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) 5397 { 5398 unsigned int j, log_length = exact_log2 (length); 5399 for (i = 0; i < nelt / 2; ++i) 5400 sel[i] = i * 2; 5401 for (i = 0; i < nelt / 2; ++i) 5402 sel[nelt / 2 + i] = i * 2 + 1; 5403 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5404 { 5405 if (dump_enabled_p ()) 5406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5407 "shuffle of 2 fields structure is not \ 5408 supported by target\n"); 5409 return false; 5410 } 5411 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel); 5412 5413 for (i = 0; i < nelt / 2; ++i) 5414 sel[i] = i * 2 + 1; 5415 for (i = 0; i < nelt / 2; ++i) 5416 sel[nelt / 2 + i] = i * 2; 5417 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5418 { 5419 if (dump_enabled_p ()) 5420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5421 "shuffle of 2 fields structure is not \ 5422 supported by target\n"); 5423 return false; 5424 } 5425 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel); 5426 5427 /* Generating permutation constant to shift all elements. 5428 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 5429 for (i = 0; i < nelt; i++) 5430 sel[i] = nelt / 2 + i; 5431 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5432 { 5433 if (dump_enabled_p ()) 5434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5435 "shift permutation is not supported by target\n"); 5436 return false; 5437 } 5438 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5439 5440 /* Generating permutation constant to select vector from 2. 5441 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ 5442 for (i = 0; i < nelt / 2; i++) 5443 sel[i] = i; 5444 for (i = nelt / 2; i < nelt; i++) 5445 sel[i] = nelt + i; 5446 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5447 { 5448 if (dump_enabled_p ()) 5449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5450 "select is not supported by target\n"); 5451 return false; 5452 } 5453 select_mask = vect_gen_perm_mask_checked (vectype, sel); 5454 5455 for (i = 0; i < log_length; i++) 5456 { 5457 for (j = 0; j < length; j += 2) 5458 { 5459 first_vect = dr_chain[j]; 5460 second_vect = dr_chain[j + 1]; 5461 5462 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5463 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5464 first_vect, first_vect, 5465 perm2_mask1); 5466 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5467 vect[0] = data_ref; 5468 5469 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5470 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5471 second_vect, second_vect, 5472 perm2_mask2); 5473 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5474 vect[1] = data_ref; 5475 5476 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); 5477 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5478 vect[0], vect[1], shift1_mask); 5479 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5480 (*result_chain)[j/2 + length/2] = data_ref; 5481 5482 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); 5483 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5484 vect[0], vect[1], select_mask); 5485 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5486 (*result_chain)[j/2] = data_ref; 5487 } 5488 memcpy (dr_chain.address (), result_chain->address (), 5489 length * sizeof (tree)); 5490 } 5491 return true; 5492 } 5493 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) 5494 { 5495 unsigned int k = 0, l = 0; 5496 5497 /* Generating permutation constant to get all elements in the right order. 5498 For vector length 8 it is {0 3 6 1 4 7 2 5}. */ 5499 for (i = 0; i < nelt; i++) 5500 { 5501 if (3 * k + (l % 3) >= nelt) 5502 { 5503 k = 0; 5504 l += (3 - (nelt % 3)); 5505 } 5506 sel[i] = 3 * k + (l % 3); 5507 k++; 5508 } 5509 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5510 { 5511 if (dump_enabled_p ()) 5512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5513 "shuffle of 3 fields structure is not \ 5514 supported by target\n"); 5515 return false; 5516 } 5517 perm3_mask = vect_gen_perm_mask_checked (vectype, sel); 5518 5519 /* Generating permutation constant to shift all elements. 5520 For vector length 8 it is {6 7 8 9 10 11 12 13}. */ 5521 for (i = 0; i < nelt; i++) 5522 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; 5523 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5524 { 5525 if (dump_enabled_p ()) 5526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5527 "shift permutation is not supported by target\n"); 5528 return false; 5529 } 5530 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5531 5532 /* Generating permutation constant to shift all elements. 5533 For vector length 8 it is {5 6 7 8 9 10 11 12}.
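(It drops the first 2 * (nelt / 3) + 1 elements of the concatenation; for nelt == 8 this is the "shift right by 5" of the 2nd step in the example above.)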
*/ 5534 for (i = 0; i < nelt; i++) 5535 sel[i] = 2 * (nelt / 3) + 1 + i; 5536 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5537 { 5538 if (dump_enabled_p ()) 5539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5540 "shift permutation is not supported by target\n"); 5541 return false; 5542 } 5543 shift2_mask = vect_gen_perm_mask_checked (vectype, sel); 5544 5545 /* Generating permutation constant to shift all elements. 5546 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 5547 for (i = 0; i < nelt; i++) 5548 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 5549 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5550 { 5551 if (dump_enabled_p ()) 5552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5553 "shift permutation is not supported by target\n"); 5554 return false; 5555 } 5556 shift3_mask = vect_gen_perm_mask_checked (vectype, sel); 5557 5558 /* Generating permutation constant to shift all elements. 5559 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 5560 for (i = 0; i < nelt; i++) 5561 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 5562 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5563 { 5564 if (dump_enabled_p ()) 5565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5566 "shift permutation is not supported by target\n"); 5567 return false; 5568 } 5569 shift4_mask = vect_gen_perm_mask_checked (vectype, sel); 5570 5571 for (k = 0; k < 3; k++) 5572 { 5573 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 5574 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5575 dr_chain[k], dr_chain[k], 5576 perm3_mask); 5577 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5578 vect[k] = data_ref; 5579 } 5580 5581 for (k = 0; k < 3; k++) 5582 { 5583 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 5584 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5585 vect[k % 3], vect[(k + 1) % 3], 5586 shift1_mask); 5587 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5588 vect_shift[k] = data_ref; 5589 } 5590 5591 for (k = 0; k < 3; k++) 5592 { 5593 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 5594 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5595 vect_shift[(4 - k) % 3], 5596 vect_shift[(3 - k) % 3], 5597 shift2_mask); 5598 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5599 vect[k] = data_ref; 5600 } 5601 5602 (*result_chain)[3 - (nelt % 3)] = vect[2]; 5603 5604 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 5605 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 5606 vect[0], shift3_mask); 5607 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5608 (*result_chain)[nelt % 3] = data_ref; 5609 5610 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 5611 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 5612 vect[1], shift4_mask); 5613 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5614 (*result_chain)[0] = data_ref; 5615 return true; 5616 } 5617 return false; 5618 } 5619 5620 /* Function vect_transform_grouped_load. 5621 5622 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 5623 to perform their permutation and ascribe the result vectorized statements to 5624 the scalar statements. 5625 */ 5626 5627 void 5628 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size, 5629 gimple_stmt_iterator *gsi) 5630 { 5631 machine_mode mode; 5632 vec<tree> result_chain = vNULL; 5633 5634 /* DR_CHAIN contains input data-refs that are a part of the interleaving. 
5635 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted 5636 vectors, which are ready for vector computation. */ 5637 result_chain.create (size); 5638 5639 /* If the reassociation width for the vector type is 2 or greater, the target 5640 machine can execute 2 or more vector instructions in parallel. Otherwise try 5641 to get the chain for the load group using vect_shift_permute_load_chain. */ 5642 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); 5643 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 5644 || exact_log2 (size) != -1 5645 || !vect_shift_permute_load_chain (dr_chain, size, stmt, 5646 gsi, &result_chain)) 5647 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); 5648 vect_record_grouped_load_vectors (stmt, result_chain); 5649 result_chain.release (); 5650 } 5651 5652 /* RESULT_CHAIN contains the output of a group of grouped loads that were 5653 generated as part of the vectorization of STMT. Assign the statement 5654 for each vector to the associated scalar statement. */ 5655 5656 void 5657 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain) 5658 { 5659 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)); 5660 gimple next_stmt, new_stmt; 5661 unsigned int i, gap_count; 5662 tree tmp_data_ref; 5663 5664 /* Put a permuted data-ref in the VECTORIZED_STMT field. 5665 Since we scan the chain starting from its first node, their order 5666 corresponds to the order of data-refs in RESULT_CHAIN. */ 5667 next_stmt = first_stmt; 5668 gap_count = 1; 5669 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) 5670 { 5671 if (!next_stmt) 5672 break; 5673 5674 /* Skip the gaps. Loads created for the gaps will be removed by the dead 5675 code elimination pass later. No need to check for the first stmt in 5676 the group, since it always exists. 5677 GROUP_GAP is the number of steps in elements from the previous 5678 access (if there is no gap GROUP_GAP is 1). We skip loads that 5679 correspond to the gaps. */ 5680 if (next_stmt != first_stmt 5681 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt))) 5682 { 5683 gap_count++; 5684 continue; 5685 } 5686 5687 while (next_stmt) 5688 { 5689 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); 5690 /* We assume that if VEC_STMT is not NULL, this is a case of multiple 5691 copies, and we put the new vector statement in the first available 5692 RELATED_STMT. */ 5693 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt))) 5694 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt; 5695 else 5696 { 5697 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) 5698 { 5699 gimple prev_stmt = 5700 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)); 5701 gimple rel_stmt = 5702 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)); 5703 while (rel_stmt) 5704 { 5705 prev_stmt = rel_stmt; 5706 rel_stmt = 5707 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt)); 5708 } 5709 5710 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = 5711 new_stmt; 5712 } 5713 } 5714 5715 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); 5716 gap_count = 1; 5717 /* If NEXT_STMT accesses the same DR as the previous statement, 5718 put the same TMP_DATA_REF as its vectorized statement; otherwise 5719 get the next data-ref from RESULT_CHAIN. */ 5720 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) 5721 break; 5722 } 5723 } 5724 } 5725 5726 /* Function vect_can_force_dr_alignment_p. 5727 5728 Returns whether the alignment of a DECL can be forced to be aligned 5729 on an ALIGNMENT-bit boundary.
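For example, a static variable defined in the current translation unit can usually be realigned up to MAX_OFILE_ALIGNMENT, and an automatic variable up to MAX_STACK_ALIGNMENT, whereas a decl whose symbol cannot have its alignment increased (see can_increase_alignment_p) must be rejected.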
*/ 5730 5731 bool 5732 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment) 5733 { 5734 if (TREE_CODE (decl) != VAR_DECL) 5735 return false; 5736 5737 if (decl_in_symtab_p (decl) 5738 && !symtab_node::get (decl)->can_increase_alignment_p ()) 5739 return false; 5740 5741 if (TREE_STATIC (decl)) 5742 return (alignment <= MAX_OFILE_ALIGNMENT); 5743 else 5744 return (alignment <= MAX_STACK_ALIGNMENT); 5745 } 5746 5747 5748 /* Return whether the data reference DR is supported with respect to its 5749 alignment. 5750 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even 5751 if it is aligned, i.e., check if it is possible to vectorize it with different 5752 alignment. */ 5753 5754 enum dr_alignment_support 5755 vect_supportable_dr_alignment (struct data_reference *dr, 5756 bool check_aligned_accesses) 5757 { 5758 gimple stmt = DR_STMT (dr); 5759 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5760 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5761 machine_mode mode = TYPE_MODE (vectype); 5762 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5763 struct loop *vect_loop = NULL; 5764 bool nested_in_vect_loop = false; 5765 5766 if (aligned_access_p (dr) && !check_aligned_accesses) 5767 return dr_aligned; 5768 5769 /* For now assume all conditional loads/stores support unaligned 5770 access without any special code. */ 5771 if (is_gimple_call (stmt) 5772 && gimple_call_internal_p (stmt) 5773 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD 5774 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)) 5775 return dr_unaligned_supported; 5776 5777 if (loop_vinfo) 5778 { 5779 vect_loop = LOOP_VINFO_LOOP (loop_vinfo); 5780 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt); 5781 } 5782 5783 /* Possibly unaligned access. */ 5784 5785 /* We can choose between using the implicit realignment scheme (generating 5786 a misaligned_move stmt) and the explicit realignment scheme (generating 5787 aligned loads with a REALIGN_LOAD). There are two variants to the 5788 explicit realignment scheme: optimized, and unoptimized. 5789 We can optimize the realignment only if the step between consecutive 5790 vector loads is equal to the vector size. Since the vector memory 5791 accesses advance in steps of VS (Vector Size) in the vectorized loop, it 5792 is guaranteed that the misalignment amount remains the same throughout the 5793 execution of the vectorized loop. Therefore, we can create the 5794 "realignment token" (the permutation mask that is passed to REALIGN_LOAD) 5795 at the loop preheader. 5796 5797 However, in the case of outer-loop vectorization, when vectorizing a 5798 memory access in the inner-loop nested within the LOOP that is now being 5799 vectorized, while it is guaranteed that the misalignment of the 5800 vectorized memory access will remain the same in different outer-loop 5801 iterations, it is *not* guaranteed that it will remain the same throughout 5802 the execution of the inner-loop. This is because the inner-loop advances 5803 with the original scalar step (and not in steps of VS). If the inner-loop 5804 step happens to be a multiple of VS, then the misalignment remains fixed 5805 and we can use the optimized realignment scheme.
For example: 5806 5807 for (i=0; i<N; i++) 5808 for (j=0; j<M; j++) 5809 s += a[i+j]; 5810 5811 When vectorizing the i-loop in the above example, the step between 5812 consecutive vector loads is 1, and so the misalignment does not remain 5813 fixed across the execution of the inner-loop, and the realignment cannot 5814 be optimized (as illustrated in the following pseudo vectorized loop): 5815 5816 for (i=0; i<N; i+=4) 5817 for (j=0; j<M; j++){ 5818 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 5819 // when j is {0,1,2,3,4,5,6,7,...} respectively. 5820 // (assuming that we start from an aligned address). 5821 } 5822 5823 We therefore have to use the unoptimized realignment scheme: 5824 5825 for (i=0; i<N; i+=4) 5826 for (j=k; j<M; j+=4) 5827 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 5828 // that the misalignment of the initial address is 5829 // 0). 5830 5831 The loop can then be vectorized as follows: 5832 5833 for (k=0; k<4; k++){ 5834 rt = get_realignment_token (&vp[k]); 5835 for (i=0; i<N; i+=4){ 5836 v1 = vp[i+k]; 5837 for (j=k; j<M; j+=4){ 5838 v2 = vp[i+j+VS-1]; 5839 va = REALIGN_LOAD <v1,v2,rt>; 5840 vs += va; 5841 v1 = v2; 5842 } 5843 } 5844 } */ 5845 5846 if (DR_IS_READ (dr)) 5847 { 5848 bool is_packed = false; 5849 tree type = (TREE_TYPE (DR_REF (dr))); 5850 5851 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 5852 && (!targetm.vectorize.builtin_mask_for_load 5853 || targetm.vectorize.builtin_mask_for_load ())) 5854 { 5855 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5856 if ((nested_in_vect_loop 5857 && (TREE_INT_CST_LOW (DR_STEP (dr)) 5858 != GET_MODE_SIZE (TYPE_MODE (vectype)))) 5859 || !loop_vinfo) 5860 return dr_explicit_realign; 5861 else 5862 return dr_explicit_realign_optimized; 5863 } 5864 if (!known_alignment_for_access_p (dr)) 5865 is_packed = not_size_aligned (DR_REF (dr)); 5866 5867 if ((TYPE_USER_ALIGN (type) && !is_packed) 5868 || targetm.vectorize. 5869 support_vector_misalignment (mode, type, 5870 DR_MISALIGNMENT (dr), is_packed)) 5871 /* Can't software pipeline the loads, but can at least do them. */ 5872 return dr_unaligned_supported; 5873 } 5874 else 5875 { 5876 bool is_packed = false; 5877 tree type = (TREE_TYPE (DR_REF (dr))); 5878 5879 if (!known_alignment_for_access_p (dr)) 5880 is_packed = not_size_aligned (DR_REF (dr)); 5881 5882 if ((TYPE_USER_ALIGN (type) && !is_packed) 5883 || targetm.vectorize. 5884 support_vector_misalignment (mode, type, 5885 DR_MISALIGNMENT (dr), is_packed)) 5886 return dr_unaligned_supported; 5887 } 5888 5889 /* Unsupported. */ 5890 return dr_unaligned_unsupported; 5891 } 5892
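/* For reference, the results returned above are: dr_aligned (the access is already sufficiently aligned), dr_unaligned_supported (the target handles the misaligned access directly), dr_explicit_realign and dr_explicit_realign_optimized (the explicit REALIGN_LOAD schemes described in the comment above, with the realignment token computed per access in the former and hoisted to the loop preheader in the latter), and dr_unaligned_unsupported. */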