/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2016 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
                              MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}


/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}


/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not supported when optimizing"
                         " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for non-constant "
                         "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}


/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  If versioning for alias can instead check
   the dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.
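   For example, a known dependence distance of 2 caps *MAX_VF at 2: with a
   larger factor a single vector load would read elements that earlier scalar
   iterations of the same vector iteration have not yet stored.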
   */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.
         */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
              || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
            {
              gimple *earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."
                                     "\n");
                  return true;
                }
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.
             */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return true;
    }

  return false;
}

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
             * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
      return false;

  return true;
}


/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  If versioning for alias can instead check
   the dependence at run-time, return FALSE.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well; they can still prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.
     */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}


/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contains the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to give up.  */
          /* ??? Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  if (vect_slp_analyze_data_ref_dependence (ddr))
                    {
                      free_dependence_relation (ddr);
                      return false;
                    }
                  free_dependence_relation (ddr);
                }
            }

          ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
          if (vect_slp_analyze_data_ref_dependence (ddr))
            {
              free_dependence_relation (ddr);
              return false;
            }
          free_dependence_relation (ddr);
        }
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  tree misalign = NULL_TREE;
  tree aligned_to;
  tree step;
  unsigned HOST_WIDE_INT alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  if (tree_fits_shwi_p (DR_STEP (dr)))
    misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.
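     For example, with a 16-byte vector type an inner-loop step of 32 bytes
     keeps the misalignment seen by the outer-loop invariant, whereas a step
     of 20 bytes does not.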
     */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);

      if (tree_fits_shwi_p (step)
          && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          misalign = STMT_VINFO_DR_INIT (stmt_info);
          aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
          base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the vector size.  */
  else
    {
      tree step = DR_STEP (dr);
      unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;

      if (tree_fits_shwi_p (step)
          && ((tree_to_shwi (step) * vf)
              % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* To look at alignment of the base we have to preserve an inner MEM_REF
     as that carries alignment information of the actual access.  */
  base = ref;
  while (handled_component_p (base))
    base = TREE_OPERAND (base, 0);
  unsigned int base_alignment = 0;
  unsigned HOST_WIDE_INT base_bitpos;
  get_object_alignment_1 (base, &base_alignment, &base_bitpos);
  /* As data-ref analysis strips the MEM_REF down to its base operand
     to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
     adjust things to make base_alignment valid as the alignment of
     DR_BASE_ADDRESS.  */
  if (TREE_CODE (base) == MEM_REF)
    {
      /* Note all this only works if DR_BASE_ADDRESS is the same as
         MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
         in other offsets.  We need to rework DR to compute the alignment
         of DR_BASE_ADDRESS as long as all information is still available.  */
      if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
        {
          base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
          base_bitpos &= (base_alignment - 1);
        }
      else
        base_bitpos = BITS_PER_UNIT;
    }
  if (base_bitpos != 0)
    base_alignment = base_bitpos & -base_bitpos;
  /* Also look at the alignment of the base address DR analysis
     computed.
     */
  unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
  if (base_addr_alignment > base_alignment)
    base_alignment = base_addr_alignment;

  if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
    DR_VECT_AUX (dr)->base_element_aligned = true;

  alignment = TYPE_ALIGN_UNIT (vectype);

  if ((compare_tree_int (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < TYPE_ALIGN (vectype))
    {
      base = base_addr;
      if (TREE_CODE (base) == ADDR_EXPR)
        base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      DR_VECT_AUX (dr)->base_element_aligned = true;
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (dr);
  /* If this is a backward running DR then the first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (step) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
         otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
      /* PLUS because STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  SET_DR_MISALIGNMENT (dr,
                       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}


/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.
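   For example, peeling NPEEL = 2 iterations of a DR that accesses 4-byte
   elements and currently has misalignment 8 against a 16-byte vector
   alignment gives (8 + 2 * 4) mod 16 = 0, i.e. that DR becomes aligned too.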
*/ 886 887 static void 888 vect_update_misalignment_for_peel (struct data_reference *dr, 889 struct data_reference *dr_peel, int npeel) 890 { 891 unsigned int i; 892 vec<dr_p> same_align_drs; 893 struct data_reference *current_dr; 894 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); 895 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel)))); 896 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 897 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel)); 898 899 /* For interleaved data accesses the step in the loop must be multiplied by 900 the size of the interleaving group. */ 901 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 902 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 903 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info)) 904 dr_peel_size *= GROUP_SIZE (peel_stmt_info); 905 906 /* It can be assumed that the data refs with the same alignment as dr_peel 907 are aligned in the vector loop. */ 908 same_align_drs 909 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel))); 910 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr) 911 { 912 if (current_dr != dr) 913 continue; 914 gcc_assert (DR_MISALIGNMENT (dr) / dr_size == 915 DR_MISALIGNMENT (dr_peel) / dr_peel_size); 916 SET_DR_MISALIGNMENT (dr, 0); 917 return; 918 } 919 920 if (known_alignment_for_access_p (dr) 921 && known_alignment_for_access_p (dr_peel)) 922 { 923 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; 924 int misal = DR_MISALIGNMENT (dr); 925 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 926 misal += negative ? -npeel * dr_size : npeel * dr_size; 927 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1; 928 SET_DR_MISALIGNMENT (dr, misal); 929 return; 930 } 931 932 if (dump_enabled_p ()) 933 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n"); 934 SET_DR_MISALIGNMENT (dr, -1); 935 } 936 937 938 /* Function verify_data_ref_alignment 939 940 Return TRUE if DR can be handled with respect to alignment. */ 941 942 static bool 943 verify_data_ref_alignment (data_reference_p dr) 944 { 945 enum dr_alignment_support supportable_dr_alignment 946 = vect_supportable_dr_alignment (dr, false); 947 if (!supportable_dr_alignment) 948 { 949 if (dump_enabled_p ()) 950 { 951 if (DR_IS_READ (dr)) 952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 953 "not vectorized: unsupported unaligned load."); 954 else 955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 956 "not vectorized: unsupported unaligned " 957 "store."); 958 959 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 960 DR_REF (dr)); 961 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 962 } 963 return false; 964 } 965 966 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 967 dump_printf_loc (MSG_NOTE, vect_location, 968 "Vectorizing an unaligned access.\n"); 969 970 return true; 971 } 972 973 /* Function vect_verify_datarefs_alignment 974 975 Return TRUE if all data references in the loop can be 976 handled with respect to alignment. */ 977 978 bool 979 vect_verify_datarefs_alignment (loop_vec_info vinfo) 980 { 981 vec<data_reference_p> datarefs = vinfo->datarefs; 982 struct data_reference *dr; 983 unsigned int i; 984 985 FOR_EACH_VEC_ELT (datarefs, i, dr) 986 { 987 gimple *stmt = DR_STMT (dr); 988 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 989 990 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 991 continue; 992 993 /* For interleaving, only the alignment of the first access matters. 
         */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if the number of iterations in
         the prologue loop (VF - misalignment) is a multiple of the
         number of interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
        int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}


/* Calculate the cost of the memory access represented by DR.
*/ 1093 1094 static void 1095 vect_get_data_access_cost (struct data_reference *dr, 1096 unsigned int *inside_cost, 1097 unsigned int *outside_cost, 1098 stmt_vector_for_cost *body_cost_vec) 1099 { 1100 gimple *stmt = DR_STMT (dr); 1101 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1102 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 1103 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1104 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1105 int ncopies = vf / nunits; 1106 1107 if (DR_IS_READ (dr)) 1108 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, 1109 NULL, body_cost_vec, false); 1110 else 1111 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); 1112 1113 if (dump_enabled_p ()) 1114 dump_printf_loc (MSG_NOTE, vect_location, 1115 "vect_get_data_access_cost: inside_cost = %d, " 1116 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1117 } 1118 1119 1120 typedef struct _vect_peel_info 1121 { 1122 int npeel; 1123 struct data_reference *dr; 1124 unsigned int count; 1125 } *vect_peel_info; 1126 1127 typedef struct _vect_peel_extended_info 1128 { 1129 struct _vect_peel_info peel_info; 1130 unsigned int inside_cost; 1131 unsigned int outside_cost; 1132 stmt_vector_for_cost body_cost_vec; 1133 } *vect_peel_extended_info; 1134 1135 1136 /* Peeling hashtable helpers. */ 1137 1138 struct peel_info_hasher : free_ptr_hash <_vect_peel_info> 1139 { 1140 static inline hashval_t hash (const _vect_peel_info *); 1141 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); 1142 }; 1143 1144 inline hashval_t 1145 peel_info_hasher::hash (const _vect_peel_info *peel_info) 1146 { 1147 return (hashval_t) peel_info->npeel; 1148 } 1149 1150 inline bool 1151 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) 1152 { 1153 return (a->npeel == b->npeel); 1154 } 1155 1156 1157 /* Insert DR into peeling hash table with NPEEL as key. */ 1158 1159 static void 1160 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1161 loop_vec_info loop_vinfo, struct data_reference *dr, 1162 int npeel) 1163 { 1164 struct _vect_peel_info elem, *slot; 1165 _vect_peel_info **new_slot; 1166 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1167 1168 elem.npeel = npeel; 1169 slot = peeling_htab->find (&elem); 1170 if (slot) 1171 slot->count++; 1172 else 1173 { 1174 slot = XNEW (struct _vect_peel_info); 1175 slot->npeel = npeel; 1176 slot->dr = dr; 1177 slot->count = 1; 1178 new_slot = peeling_htab->find_slot (slot, INSERT); 1179 *new_slot = slot; 1180 } 1181 1182 if (!supportable_dr_alignment 1183 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1184 slot->count += VECT_MAX_COST; 1185 } 1186 1187 1188 /* Traverse peeling hash table to find peeling option that aligns maximum 1189 number of data accesses. */ 1190 1191 int 1192 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1193 _vect_peel_extended_info *max) 1194 { 1195 vect_peel_info elem = *slot; 1196 1197 if (elem->count > max->peel_info.count 1198 || (elem->count == max->peel_info.count 1199 && max->peel_info.npeel > elem->npeel)) 1200 { 1201 max->peel_info.npeel = elem->npeel; 1202 max->peel_info.count = elem->count; 1203 max->peel_info.dr = elem->dr; 1204 } 1205 1206 return 1; 1207 } 1208 1209 1210 /* Traverse peeling hash table and calculate cost for each peeling option. 1211 Find the one with the lowest cost. 
*/ 1212 1213 int 1214 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, 1215 _vect_peel_extended_info *min) 1216 { 1217 vect_peel_info elem = *slot; 1218 int save_misalignment, dummy; 1219 unsigned int inside_cost = 0, outside_cost = 0, i; 1220 gimple *stmt = DR_STMT (elem->dr); 1221 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1222 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1223 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1224 struct data_reference *dr; 1225 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec; 1226 1227 prologue_cost_vec.create (2); 1228 body_cost_vec.create (2); 1229 epilogue_cost_vec.create (2); 1230 1231 FOR_EACH_VEC_ELT (datarefs, i, dr) 1232 { 1233 stmt = DR_STMT (dr); 1234 stmt_info = vinfo_for_stmt (stmt); 1235 /* For interleaving, only the alignment of the first access 1236 matters. */ 1237 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1238 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1239 continue; 1240 1241 /* Strided accesses perform only component accesses, alignment is 1242 irrelevant for them. */ 1243 if (STMT_VINFO_STRIDED_P (stmt_info) 1244 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1245 continue; 1246 1247 save_misalignment = DR_MISALIGNMENT (dr); 1248 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel); 1249 vect_get_data_access_cost (dr, &inside_cost, &outside_cost, 1250 &body_cost_vec); 1251 SET_DR_MISALIGNMENT (dr, save_misalignment); 1252 } 1253 1254 outside_cost += vect_get_known_peeling_cost 1255 (loop_vinfo, elem->npeel, &dummy, 1256 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1257 &prologue_cost_vec, &epilogue_cost_vec); 1258 1259 /* Prologue and epilogue costs are added to the target model later. 1260 These costs depend only on the scalar iteration cost, the 1261 number of peeling iterations finally chosen, and the number of 1262 misaligned statements. So discard the information found here. */ 1263 prologue_cost_vec.release (); 1264 epilogue_cost_vec.release (); 1265 1266 if (inside_cost < min->inside_cost 1267 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost)) 1268 { 1269 min->inside_cost = inside_cost; 1270 min->outside_cost = outside_cost; 1271 min->body_cost_vec.release (); 1272 min->body_cost_vec = body_cost_vec; 1273 min->peel_info.dr = elem->dr; 1274 min->peel_info.npeel = elem->npeel; 1275 } 1276 else 1277 body_cost_vec.release (); 1278 1279 return 1; 1280 } 1281 1282 1283 /* Choose best peeling option by traversing peeling hash table and either 1284 choosing an option with the lowest cost (if cost model is enabled) or the 1285 option that aligns as many accesses as possible. 
*/ 1286 1287 static struct data_reference * 1288 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, 1289 loop_vec_info loop_vinfo, 1290 unsigned int *npeel, 1291 stmt_vector_for_cost *body_cost_vec) 1292 { 1293 struct _vect_peel_extended_info res; 1294 1295 res.peel_info.dr = NULL; 1296 res.body_cost_vec = stmt_vector_for_cost (); 1297 1298 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1299 { 1300 res.inside_cost = INT_MAX; 1301 res.outside_cost = INT_MAX; 1302 peeling_htab->traverse <_vect_peel_extended_info *, 1303 vect_peeling_hash_get_lowest_cost> (&res); 1304 } 1305 else 1306 { 1307 res.peel_info.count = 0; 1308 peeling_htab->traverse <_vect_peel_extended_info *, 1309 vect_peeling_hash_get_most_frequent> (&res); 1310 } 1311 1312 *npeel = res.peel_info.npeel; 1313 *body_cost_vec = res.body_cost_vec; 1314 return res.peel_info.dr; 1315 } 1316 1317 1318 /* Function vect_enhance_data_refs_alignment 1319 1320 This pass will use loop versioning and loop peeling in order to enhance 1321 the alignment of data references in the loop. 1322 1323 FOR NOW: we assume that whatever versioning/peeling takes place, only the 1324 original loop is to be vectorized. Any other loops that are created by 1325 the transformations performed in this pass - are not supposed to be 1326 vectorized. This restriction will be relaxed. 1327 1328 This pass will require a cost model to guide it whether to apply peeling 1329 or versioning or a combination of the two. For example, the scheme that 1330 intel uses when given a loop with several memory accesses, is as follows: 1331 choose one memory access ('p') which alignment you want to force by doing 1332 peeling. Then, either (1) generate a loop in which 'p' is aligned and all 1333 other accesses are not necessarily aligned, or (2) use loop versioning to 1334 generate one loop in which all accesses are aligned, and another loop in 1335 which only 'p' is necessarily aligned. 1336 1337 ("Automatic Intra-Register Vectorization for the Intel Architecture", 1338 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International 1339 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) 1340 1341 Devising a cost model is the most critical aspect of this work. It will 1342 guide us on which access to peel for, whether to use loop versioning, how 1343 many versions to create, etc. The cost model will probably consist of 1344 generic considerations as well as target specific considerations (on 1345 powerpc for example, misaligned stores are more painful than misaligned 1346 loads). 1347 1348 Here are the general steps involved in alignment enhancements: 1349 1350 -- original loop, before alignment analysis: 1351 for (i=0; i<N; i++){ 1352 x = q[i]; # DR_MISALIGNMENT(q) = unknown 1353 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1354 } 1355 1356 -- After vect_compute_data_refs_alignment: 1357 for (i=0; i<N; i++){ 1358 x = q[i]; # DR_MISALIGNMENT(q) = 3 1359 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1360 } 1361 1362 -- Possibility 1: we do loop versioning: 1363 if (p is aligned) { 1364 for (i=0; i<N; i++){ # loop 1A 1365 x = q[i]; # DR_MISALIGNMENT(q) = 3 1366 p[i] = y; # DR_MISALIGNMENT(p) = 0 1367 } 1368 } 1369 else { 1370 for (i=0; i<N; i++){ # loop 1B 1371 x = q[i]; # DR_MISALIGNMENT(q) = 3 1372 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1373 } 1374 } 1375 1376 -- Possibility 2: we do loop peeling: 1377 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 
       x = q[i];
       p[i] = y;
     }
     for (i = 3; i < N; i++){           # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
       x = q[i];
       p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */

bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple *stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool all_misalignments_unknown = true;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int nelements, mis, same_align_drs_max = 0;
  stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
  hash_table<peel_info_hasher> peeling_htab (1);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              /* Save info about DR in the hash table.  */
              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                                             TREE_TYPE (DR_REF (dr))));
              npeel_tmp = (negative
                           ? (mis - nelements) : (nelements - mis))
                          & (nelements - 1);

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  Vectorization factor will be 8.  If
                 both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will also be aligned by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for cost model, since we calculate
                 cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                {
                  if (STMT_SLP_TYPE (stmt_info))
                    possible_npeel_number
                      = (vf * GROUP_SIZE (stmt_info)) / nelements;
                  else
                    possible_npeel_number = vf / nelements;
                }

              /* Handle the aligned case.  We may decide to align some other
                 access, making DR unaligned.  */
              if (DR_MISALIGNMENT (dr) == 0)
                {
                  npeel_tmp = 0;
                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                    possible_npeel_number++;
                }

              for (j = 0; j < possible_npeel_number; j++)
                {
                  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
                                            dr, npeel_tmp);
                  npeel_tmp += nelements;
                }

              all_misalignments_unknown = false;
              /* Data-ref that was chosen for the case that all the
                 misalignments are unknown is not relevant anymore, since we
                 have a data-ref with known alignment.  */
              dr0 = NULL;
            }
          else
            {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target prefers
                 to align stores over loads.
                 */
              if (all_misalignments_unknown)
                {
                  unsigned same_align_drs
                    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
                  if (!dr0
                      || same_align_drs_max < same_align_drs)
                    {
                      same_align_drs_max = same_align_drs;
                      dr0 = dr;
                    }
                  /* For data-refs with the same number of related
                     accesses prefer the one where the misalign
                     computation will be invariant in the outermost loop.  */
                  else if (same_align_drs_max == same_align_drs)
                    {
                      struct loop *ivloop0, *ivloop;
                      ivloop0 = outermost_invariant_loop_for_expr
                                  (loop, DR_BASE_ADDRESS (dr0));
                      ivloop = outermost_invariant_loop_for_expr
                                 (loop, DR_BASE_ADDRESS (dr));
                      if ((ivloop && !ivloop0)
                          || (ivloop && ivloop0
                              && flow_loop_nested_p (ivloop, ivloop0)))
                        dr0 = dr;
                    }

                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }

              /* If there are both known and unknown misaligned accesses in the
                 loop, we choose peeling amount according to the known
                 accesses.  */
              if (!supportable_dr_alignment)
                {
                  dr0 = dr;
                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }
            }
        }
      else
        {
          if (!aligned_access_p (dr))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "vector alignment may not be reachable\n");
              break;
            }
        }
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;

  if (do_peeling
      && all_misalignments_unknown
      && vect_supportable_dr_alignment (dr0, false))
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      if (first_store && DR_IS_READ (dr0))
        {
          unsigned int load_inside_cost = 0, load_outside_cost = 0;
          unsigned int store_inside_cost = 0, store_outside_cost = 0;
          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
          stmt_vector_for_cost dummy;
          dummy.create (2);

          vect_get_data_access_cost (dr0, &load_inside_cost,
                                     &load_outside_cost, &dummy);
          vect_get_data_access_cost (first_store, &store_inside_cost,
                                     &store_outside_cost, &dummy);

          dummy.release ();

          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
             aligning the load DR0).  */
          load_inside_penalty = store_inside_cost;
          load_outside_penalty = store_outside_cost;
          for (i = 0;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                 DR_STMT (first_store))).iterate (i, &dr);
               i++)
            if (DR_IS_READ (dr))
              {
                load_inside_penalty += load_inside_cost;
                load_outside_penalty += load_outside_cost;
              }
            else
              {
                load_inside_penalty += store_inside_cost;
                load_outside_penalty += store_outside_cost;
              }

          /* Calculate the penalty for leaving DR0 unaligned (by
             aligning the FIRST_STORE).
             */
          store_inside_penalty = load_inside_cost;
          store_outside_penalty = load_outside_cost;
          for (i = 0;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                 DR_STMT (dr0))).iterate (i, &dr);
               i++)
            if (DR_IS_READ (dr))
              {
                store_inside_penalty += load_inside_cost;
                store_outside_penalty += load_outside_cost;
              }
            else
              {
                store_inside_penalty += store_inside_cost;
                store_outside_penalty += store_outside_cost;
              }

          if (load_inside_penalty > store_inside_penalty
              || (load_inside_penalty == store_inside_penalty
                  && load_outside_penalty > store_outside_penalty))
            dr0 = first_store;
        }

      /* In case there are only loads with different unknown misalignments, use
         peeling only if it may help to align other accesses in the loop or
         if it may help improving load bandwidth when we'd end up using
         unaligned loads.  */
      tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
      if (!first_store
          && !STMT_VINFO_SAME_ALIGN_REFS (
                vinfo_for_stmt (DR_STMT (dr0))).length ()
          && (vect_supportable_dr_alignment (dr0, false)
              != dr_unaligned_supported
              || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
                  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
        do_peeling = false;
    }

  if (do_peeling && !dr0)
    {
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling.  */

      /* We should get here only if there are drs with known misalignment.  */
      gcc_assert (!all_misalignments_unknown);

      /* Choose the best peeling from the hash table.  */
      dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
                                                   loop_vinfo, &npeel,
                                                   &body_cost_vec);
      if (!dr0 || !npeel)
        do_peeling = false;
    }

  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      nelements = TYPE_VECTOR_SUBPARTS (vectype);

      if (known_alignment_for_access_p (dr0))
        {
          bool negative = tree_int_cst_compare (DR_STEP (dr0),
                                                size_zero_node) < 0;
          if (!npeel)
            {
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = DR_MISALIGNMENT (dr0);
              mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
              npeel = ((negative ? mis - nelements : nelements - mis)
                       & (nelements - 1));
            }

          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          stmt_info = vinfo_for_stmt (DR_STMT (dr0));
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);
        }

      /* Ensure that all data refs can be vectorized after the peel.  */
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          int save_misalignment;

          if (dr == dr0)
            continue;

          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);
          /* For interleaving, only the alignment of the first access
             matters.
*/ 1770 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1771 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1772 continue; 1773 1774 /* Strided accesses perform only component accesses, alignment is 1775 irrelevant for them. */ 1776 if (STMT_VINFO_STRIDED_P (stmt_info) 1777 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1778 continue; 1779 1780 save_misalignment = DR_MISALIGNMENT (dr); 1781 vect_update_misalignment_for_peel (dr, dr0, npeel); 1782 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1783 SET_DR_MISALIGNMENT (dr, save_misalignment); 1784 1785 if (!supportable_dr_alignment) 1786 { 1787 do_peeling = false; 1788 break; 1789 } 1790 } 1791 1792 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 1793 { 1794 stat = vect_verify_datarefs_alignment (loop_vinfo); 1795 if (!stat) 1796 do_peeling = false; 1797 else 1798 { 1799 body_cost_vec.release (); 1800 return stat; 1801 } 1802 } 1803 1804 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ 1805 if (do_peeling) 1806 { 1807 unsigned max_allowed_peel 1808 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 1809 if (max_allowed_peel != (unsigned)-1) 1810 { 1811 unsigned max_peel = npeel; 1812 if (max_peel == 0) 1813 { 1814 gimple *dr_stmt = DR_STMT (dr0); 1815 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt); 1816 tree vtype = STMT_VINFO_VECTYPE (vinfo); 1817 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1; 1818 } 1819 if (max_peel > max_allowed_peel) 1820 { 1821 do_peeling = false; 1822 if (dump_enabled_p ()) 1823 dump_printf_loc (MSG_NOTE, vect_location, 1824 "Disable peeling, max peels reached: %d\n", max_peel); 1825 } 1826 } 1827 } 1828 1829 /* Cost model #2 - if peeling may result in a remaining loop not 1830 iterating enough to be vectorized then do not peel. */ 1831 if (do_peeling 1832 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1833 { 1834 unsigned max_peel 1835 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel; 1836 if (LOOP_VINFO_INT_NITERS (loop_vinfo) 1837 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel) 1838 do_peeling = false; 1839 } 1840 1841 if (do_peeling) 1842 { 1843 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 1844 If the misalignment of DR_i is identical to that of dr0 then set 1845 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 1846 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 1847 by the peeling factor times the element size of DR_i (MOD the 1848 vectorization factor times the size). Otherwise, the 1849 misalignment of DR_i must be set to unknown. */ 1850 FOR_EACH_VEC_ELT (datarefs, i, dr) 1851 if (dr != dr0) 1852 { 1853 /* Strided accesses perform only component accesses, alignment 1854 is irrelevant for them. 
*/ 1855 stmt_info = vinfo_for_stmt (DR_STMT (dr)); 1856 if (STMT_VINFO_STRIDED_P (stmt_info) 1857 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1858 continue; 1859 1860 vect_update_misalignment_for_peel (dr, dr0, npeel); 1861 } 1862 1863 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 1864 if (npeel) 1865 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 1866 else 1867 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 1868 = DR_MISALIGNMENT (dr0); 1869 SET_DR_MISALIGNMENT (dr0, 0); 1870 if (dump_enabled_p ()) 1871 { 1872 dump_printf_loc (MSG_NOTE, vect_location, 1873 "Alignment of access forced using peeling.\n"); 1874 dump_printf_loc (MSG_NOTE, vect_location, 1875 "Peeling for alignment will be applied.\n"); 1876 } 1877 /* The inside-loop cost will be accounted for in vectorizable_load 1878 and vectorizable_store correctly with adjusted alignments. 1879 Drop the body_cst_vec on the floor here. */ 1880 body_cost_vec.release (); 1881 1882 stat = vect_verify_datarefs_alignment (loop_vinfo); 1883 gcc_assert (stat); 1884 return stat; 1885 } 1886 } 1887 1888 body_cost_vec.release (); 1889 1890 /* (2) Versioning to force alignment. */ 1891 1892 /* Try versioning if: 1893 1) optimize loop for speed 1894 2) there is at least one unsupported misaligned data ref with an unknown 1895 misalignment, and 1896 3) all misaligned data refs with a known misalignment are supported, and 1897 4) the number of runtime alignment checks is within reason. */ 1898 1899 do_versioning = 1900 optimize_loop_nest_for_speed_p (loop) 1901 && (!loop->inner); /* FORNOW */ 1902 1903 if (do_versioning) 1904 { 1905 FOR_EACH_VEC_ELT (datarefs, i, dr) 1906 { 1907 stmt = DR_STMT (dr); 1908 stmt_info = vinfo_for_stmt (stmt); 1909 1910 /* For interleaving, only the alignment of the first access 1911 matters. */ 1912 if (aligned_access_p (dr) 1913 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1914 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 1915 continue; 1916 1917 if (STMT_VINFO_STRIDED_P (stmt_info)) 1918 { 1919 /* Strided loads perform only component accesses, alignment is 1920 irrelevant for them. */ 1921 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1922 continue; 1923 do_versioning = false; 1924 break; 1925 } 1926 1927 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1928 1929 if (!supportable_dr_alignment) 1930 { 1931 gimple *stmt; 1932 int mask; 1933 tree vectype; 1934 1935 if (known_alignment_for_access_p (dr) 1936 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 1937 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 1938 { 1939 do_versioning = false; 1940 break; 1941 } 1942 1943 stmt = DR_STMT (dr); 1944 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 1945 gcc_assert (vectype); 1946 1947 /* The rightmost bits of an aligned address must be zeros. 1948 Construct the mask needed for this test. For example, 1949 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 1950 mask must be 15 = 0xf. */ 1951 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1; 1952 1953 /* FORNOW: use the same mask to test all potentially unaligned 1954 references in the loop. The vectorizer currently supports 1955 a single vector size, see the reference to 1956 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 1957 vectorization factor is computed. 
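	 As an illustration only (the guard itself is emitted elsewhere in the vectorizer, not in this function), the versioned loop for V4SI is protected by a test along the lines of
	   if (((addr_1 | addr_2 | ... | addr_n) & 15) == 0)
	     ... run the loop variant in which all these DRs are assumed aligned ...
	   else
	     ... run the unvectorized variant ...
	 i.e. the addresses of the LOOP_VINFO_MAY_MISALIGN_STMTS references are OR-ed together and tested against LOOP_VINFO_PTR_MASK.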
*/ 1958 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 1959 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 1960 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 1961 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 1962 DR_STMT (dr)); 1963 } 1964 } 1965 1966 /* Versioning requires at least one misaligned data reference. */ 1967 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 1968 do_versioning = false; 1969 else if (!do_versioning) 1970 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 1971 } 1972 1973 if (do_versioning) 1974 { 1975 vec<gimple *> may_misalign_stmts 1976 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 1977 gimple *stmt; 1978 1979 /* It can now be assumed that the data references in the statements 1980 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 1981 of the loop being vectorized. */ 1982 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 1983 { 1984 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1985 dr = STMT_VINFO_DATA_REF (stmt_info); 1986 SET_DR_MISALIGNMENT (dr, 0); 1987 if (dump_enabled_p ()) 1988 dump_printf_loc (MSG_NOTE, vect_location, 1989 "Alignment of access forced using versioning.\n"); 1990 } 1991 1992 if (dump_enabled_p ()) 1993 dump_printf_loc (MSG_NOTE, vect_location, 1994 "Versioning for alignment will be applied.\n"); 1995 1996 /* Peeling and versioning can't be done together at this time. */ 1997 gcc_assert (! (do_peeling && do_versioning)); 1998 1999 stat = vect_verify_datarefs_alignment (loop_vinfo); 2000 gcc_assert (stat); 2001 return stat; 2002 } 2003 2004 /* This point is reached if neither peeling nor versioning is being done. */ 2005 gcc_assert (! (do_peeling || do_versioning)); 2006 2007 stat = vect_verify_datarefs_alignment (loop_vinfo); 2008 return stat; 2009 } 2010 2011 2012 /* Function vect_find_same_alignment_drs. 2013 2014 Update group and alignment relations according to the chosen 2015 vectorization factor. */ 2016 2017 static void 2018 vect_find_same_alignment_drs (struct data_dependence_relation *ddr, 2019 loop_vec_info loop_vinfo) 2020 { 2021 unsigned int i; 2022 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2023 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2024 struct data_reference *dra = DDR_A (ddr); 2025 struct data_reference *drb = DDR_B (ddr); 2026 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2027 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2028 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra)))); 2029 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb)))); 2030 lambda_vector dist_v; 2031 unsigned int loop_depth; 2032 2033 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2034 return; 2035 2036 if (dra == drb) 2037 return; 2038 2039 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 2040 return; 2041 2042 /* Loop-based vectorization and known data dependence. */ 2043 if (DDR_NUM_DIST_VECTS (ddr) == 0) 2044 return; 2045 2046 /* Data-dependence analysis reports a distance vector of zero 2047 for data-references that overlap only in the first iteration 2048 but have different sign step (see PR45764). 2049 So as a sanity check require equal DR_STEP. 
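     With equal steps, a hypothetical illustration of what the loop below records: for a vectorization factor of 4 and int references a[i] and a[i+4], the dependence distance is 4, so dist % vectorization_factor == 0 and each reference is pushed onto the other's STMT_VINFO_SAME_ALIGN_REFS; within any vector iteration the two accesses have the same misalignment.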
*/ 2050 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2051 return; 2052 2053 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 2054 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 2055 { 2056 int dist = dist_v[loop_depth]; 2057 2058 if (dump_enabled_p ()) 2059 dump_printf_loc (MSG_NOTE, vect_location, 2060 "dependence distance = %d.\n", dist); 2061 2062 /* Same loop iteration. */ 2063 if (dist == 0 2064 || (dist % vectorization_factor == 0 && dra_size == drb_size)) 2065 { 2066 /* Two references with distance zero have the same alignment. */ 2067 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2068 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2069 if (dump_enabled_p ()) 2070 { 2071 dump_printf_loc (MSG_NOTE, vect_location, 2072 "accesses have the same alignment.\n"); 2073 dump_printf (MSG_NOTE, 2074 "dependence distance modulo vf == 0 between "); 2075 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2076 dump_printf (MSG_NOTE, " and "); 2077 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2078 dump_printf (MSG_NOTE, "\n"); 2079 } 2080 } 2081 } 2082 } 2083 2084 2085 /* Function vect_analyze_data_refs_alignment 2086 2087 Analyze the alignment of the data-references in the loop. 2088 Return FALSE if a data reference is found that cannot be vectorized. */ 2089 2090 bool 2091 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2092 { 2093 if (dump_enabled_p ()) 2094 dump_printf_loc (MSG_NOTE, vect_location, 2095 "=== vect_analyze_data_refs_alignment ===\n"); 2096 2097 /* Mark groups of data references with same alignment using 2098 data dependence information. */ 2099 vec<ddr_p> ddrs = vinfo->ddrs; 2100 struct data_dependence_relation *ddr; 2101 unsigned int i; 2102 2103 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2104 vect_find_same_alignment_drs (ddr, vinfo); 2105 2106 vec<data_reference_p> datarefs = vinfo->datarefs; 2107 struct data_reference *dr; 2108 2109 FOR_EACH_VEC_ELT (datarefs, i, dr) 2110 { 2111 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2112 if (STMT_VINFO_VECTORIZABLE (stmt_info) 2113 && !vect_compute_data_ref_alignment (dr)) 2114 { 2115 /* Strided accesses perform only component accesses, misalignment 2116 information is irrelevant for them. */ 2117 if (STMT_VINFO_STRIDED_P (stmt_info) 2118 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2119 continue; 2120 2121 if (dump_enabled_p ()) 2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2123 "not vectorized: can't calculate alignment " 2124 "for data ref.\n"); 2125 2126 return false; 2127 } 2128 } 2129 2130 return true; 2131 } 2132 2133 2134 /* Analyze alignment of DRs of stmts in NODE. */ 2135 2136 static bool 2137 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2138 { 2139 /* We vectorize from the first scalar stmt in the node unless 2140 the node is permuted in which case we start from the first 2141 element in the group. */ 2142 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2143 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2144 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2145 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 2146 2147 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2148 if (! vect_compute_data_ref_alignment (dr) 2149 /* For creating the data-ref pointer we need alignment of the 2150 first element anyway. */ 2151 || (dr != first_dr 2152 && ! vect_compute_data_ref_alignment (first_dr)) 2153 || ! 
verify_data_ref_alignment (dr)) 2154 { 2155 if (dump_enabled_p ()) 2156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2157 "not vectorized: bad data alignment in basic " 2158 "block.\n"); 2159 return false; 2160 } 2161 2162 return true; 2163 } 2164 2165 /* Function vect_slp_analyze_instance_alignment 2166 2167 Analyze the alignment of the data-references in the SLP instance. 2168 Return FALSE if a data reference is found that cannot be vectorized. */ 2169 2170 bool 2171 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2172 { 2173 if (dump_enabled_p ()) 2174 dump_printf_loc (MSG_NOTE, vect_location, 2175 "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); 2176 2177 slp_tree node; 2178 unsigned i; 2179 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2180 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2181 return false; 2182 2183 node = SLP_INSTANCE_TREE (instance); 2184 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) 2185 && ! vect_slp_analyze_and_verify_node_alignment 2186 (SLP_INSTANCE_TREE (instance))) 2187 return false; 2188 2189 return true; 2190 } 2191 2192 2193 /* Analyze groups of accesses: check that DR belongs to a group of 2194 accesses of legal size, step, etc. Detect gaps, single element 2195 interleaving, and other special cases. Set grouped access info. 2196 Collect groups of strided stores for further use in SLP analysis. 2197 Worker for vect_analyze_group_access. */ 2198 2199 static bool 2200 vect_analyze_group_access_1 (struct data_reference *dr) 2201 { 2202 tree step = DR_STEP (dr); 2203 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2204 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2205 gimple *stmt = DR_STMT (dr); 2206 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2207 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2208 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2209 HOST_WIDE_INT dr_step = -1; 2210 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2211 bool slp_impossible = false; 2212 2213 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2214 size of the interleaving group (including gaps). */ 2215 if (tree_fits_shwi_p (step)) 2216 { 2217 dr_step = tree_to_shwi (step); 2218 /* Check that STEP is a multiple of type size. Otherwise there is 2219 a non-element-sized gap at the end of the group which we 2220 cannot represent in GROUP_GAP or GROUP_SIZE. 2221 ??? As we can handle non-constant step fine here we should 2222 simply remove uses of GROUP_GAP between the last and first 2223 element and instead rely on DR_STEP. GROUP_SIZE then would 2224 simply not include that gap. */ 2225 if ((dr_step % type_size) != 0) 2226 { 2227 if (dump_enabled_p ()) 2228 { 2229 dump_printf_loc (MSG_NOTE, vect_location, 2230 "Step "); 2231 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2232 dump_printf (MSG_NOTE, 2233 " is not a multiple of the element size for "); 2234 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2235 dump_printf (MSG_NOTE, "\n"); 2236 } 2237 return false; 2238 } 2239 groupsize = absu_hwi (dr_step) / type_size; 2240 } 2241 else 2242 groupsize = 0; 2243 2244 /* Not consecutive access is possible only if it is a part of interleaving. */ 2245 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 2246 { 2247 /* Check if it this DR is a part of interleaving, and is a single 2248 element of the group that is accessed in the loop. */ 2249 2250 /* Gaps are supported only for loads. 
STEP must be a multiple of the type 2251 size. The size of the group must be a power of 2. */ 2252 if (DR_IS_READ (dr) 2253 && (dr_step % type_size) == 0 2254 && groupsize > 0 2255 && exact_log2 (groupsize) != -1) 2256 { 2257 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; 2258 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2259 if (dump_enabled_p ()) 2260 { 2261 dump_printf_loc (MSG_NOTE, vect_location, 2262 "Detected single element interleaving "); 2263 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2264 dump_printf (MSG_NOTE, " step "); 2265 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2266 dump_printf (MSG_NOTE, "\n"); 2267 } 2268 2269 return true; 2270 } 2271 2272 if (dump_enabled_p ()) 2273 { 2274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2275 "not consecutive access "); 2276 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 2277 } 2278 2279 if (bb_vinfo) 2280 { 2281 /* Mark the statement as unvectorizable. */ 2282 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2283 return true; 2284 } 2285 2286 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); 2287 STMT_VINFO_STRIDED_P (stmt_info) = true; 2288 return true; 2289 } 2290 2291 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) 2292 { 2293 /* First stmt in the interleaving chain. Check the chain. */ 2294 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2295 struct data_reference *data_ref = dr; 2296 unsigned int count = 1; 2297 tree prev_init = DR_INIT (data_ref); 2298 gimple *prev = stmt; 2299 HOST_WIDE_INT diff, gaps = 0; 2300 2301 while (next) 2302 { 2303 /* Skip same data-refs. In case that two or more stmts share 2304 data-ref (supported only for loads), we vectorize only the first 2305 stmt, and the rest get their vectorized loads from the first 2306 one. */ 2307 if (!tree_int_cst_compare (DR_INIT (data_ref), 2308 DR_INIT (STMT_VINFO_DATA_REF ( 2309 vinfo_for_stmt (next))))) 2310 { 2311 if (DR_IS_WRITE (data_ref)) 2312 { 2313 if (dump_enabled_p ()) 2314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2315 "Two store stmts share the same dr.\n"); 2316 return false; 2317 } 2318 2319 if (dump_enabled_p ()) 2320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2321 "Two or more load stmts share the same dr.\n"); 2322 2323 /* For load use the same data-ref load. */ 2324 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2325 2326 prev = next; 2327 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2328 continue; 2329 } 2330 2331 prev = next; 2332 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2333 2334 /* All group members have the same STEP by construction. */ 2335 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2336 2337 /* Check that the distance between two accesses is equal to the type 2338 size. Otherwise, we have gaps. */ 2339 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2340 - TREE_INT_CST_LOW (prev_init)) / type_size; 2341 if (diff != 1) 2342 { 2343 /* FORNOW: SLP of accesses with gaps is not supported. */ 2344 slp_impossible = true; 2345 if (DR_IS_WRITE (data_ref)) 2346 { 2347 if (dump_enabled_p ()) 2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2349 "interleaved store with gaps\n"); 2350 return false; 2351 } 2352 2353 gaps += diff - 1; 2354 } 2355 2356 last_accessed_element += diff; 2357 2358 /* Store the gap from the previous member of the group. If there is no 2359 gap in the access, GROUP_GAP is always 1. 
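	     A hypothetical illustration with int elements (type_size 4): for the read group a[4*i], a[4*i+1], a[4*i+3] with DR_STEP 16, the member at 4*i+1 gets GROUP_GAP 1 (no gap from its predecessor) while the member at 4*i+3 gets GROUP_GAP 2, because element 4*i+2 is skipped.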
*/ 2360 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2361 2362 prev_init = DR_INIT (data_ref); 2363 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2364 /* Count the number of data-refs in the chain. */ 2365 count++; 2366 } 2367 2368 if (groupsize == 0) 2369 groupsize = count + gaps; 2370 2371 if (groupsize > UINT_MAX) 2372 { 2373 if (dump_enabled_p ()) 2374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2375 "group is too large\n"); 2376 return false; 2377 } 2378 2379 /* Check that the size of the interleaving is equal to count for stores, 2380 i.e., that there are no gaps. */ 2381 if (groupsize != count 2382 && !DR_IS_READ (dr)) 2383 { 2384 if (dump_enabled_p ()) 2385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2386 "interleaved store with gaps\n"); 2387 return false; 2388 } 2389 2390 /* If there is a gap after the last load in the group it is the 2391 difference between the groupsize and the last accessed 2392 element. 2393 When there is no gap, this difference should be 0. */ 2394 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; 2395 2396 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2397 if (dump_enabled_p ()) 2398 { 2399 dump_printf_loc (MSG_NOTE, vect_location, 2400 "Detected interleaving "); 2401 if (DR_IS_READ (dr)) 2402 dump_printf (MSG_NOTE, "load "); 2403 else 2404 dump_printf (MSG_NOTE, "store "); 2405 dump_printf (MSG_NOTE, "of size %u starting with ", 2406 (unsigned)groupsize); 2407 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2408 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) 2409 dump_printf_loc (MSG_NOTE, vect_location, 2410 "There is a gap of %u elements after the group\n", 2411 GROUP_GAP (vinfo_for_stmt (stmt))); 2412 } 2413 2414 /* SLP: create an SLP data structure for every interleaving group of 2415 stores for further analysis in vect_analyse_slp. */ 2416 if (DR_IS_WRITE (dr) && !slp_impossible) 2417 { 2418 if (loop_vinfo) 2419 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2420 if (bb_vinfo) 2421 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2422 } 2423 } 2424 2425 return true; 2426 } 2427 2428 /* Analyze groups of accesses: check that DR belongs to a group of 2429 accesses of legal size, step, etc. Detect gaps, single element 2430 interleaving, and other special cases. Set grouped access info. 2431 Collect groups of strided stores for further use in SLP analysis. */ 2432 2433 static bool 2434 vect_analyze_group_access (struct data_reference *dr) 2435 { 2436 if (!vect_analyze_group_access_1 (dr)) 2437 { 2438 /* Dissolve the group if present. */ 2439 gimple *next; 2440 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); 2441 while (stmt) 2442 { 2443 stmt_vec_info vinfo = vinfo_for_stmt (stmt); 2444 next = GROUP_NEXT_ELEMENT (vinfo); 2445 GROUP_FIRST_ELEMENT (vinfo) = NULL; 2446 GROUP_NEXT_ELEMENT (vinfo) = NULL; 2447 stmt = next; 2448 } 2449 return false; 2450 } 2451 return true; 2452 } 2453 2454 /* Analyze the access pattern of the data-reference DR. 2455 In case of non-consecutive accesses call vect_analyze_group_access() to 2456 analyze groups of accesses. 
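   As a rough guide (hypothetical examples, not taken from this file): a reference such as a[i] whose DR_STEP equals the scalar type size is classified as consecutive and needs no group analysis; accesses such as a[3*i] or the members of an interleaved group fall through to vect_analyze_group_access (), and a non-constant step is handled as a strided access.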
*/ 2457 2458 static bool 2459 vect_analyze_data_ref_access (struct data_reference *dr) 2460 { 2461 tree step = DR_STEP (dr); 2462 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2463 gimple *stmt = DR_STMT (dr); 2464 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2465 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2466 struct loop *loop = NULL; 2467 2468 if (loop_vinfo) 2469 loop = LOOP_VINFO_LOOP (loop_vinfo); 2470 2471 if (loop_vinfo && !step) 2472 { 2473 if (dump_enabled_p ()) 2474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2475 "bad data-ref access in loop\n"); 2476 return false; 2477 } 2478 2479 /* Allow loads with zero step in inner-loop vectorization. */ 2480 if (loop_vinfo && integer_zerop (step)) 2481 { 2482 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2483 if (!nested_in_vect_loop_p (loop, stmt)) 2484 return DR_IS_READ (dr); 2485 /* Allow references with zero step for outer loops marked 2486 with pragma omp simd only - it guarantees absence of 2487 loop-carried dependencies between inner loop iterations. */ 2488 if (!loop->force_vectorize) 2489 { 2490 if (dump_enabled_p ()) 2491 dump_printf_loc (MSG_NOTE, vect_location, 2492 "zero step in inner loop of nest\n"); 2493 return false; 2494 } 2495 } 2496 2497 if (loop && nested_in_vect_loop_p (loop, stmt)) 2498 { 2499 /* Interleaved accesses are not yet supported within outer-loop 2500 vectorization for references in the inner-loop. */ 2501 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2502 2503 /* For the rest of the analysis we use the outer-loop step. */ 2504 step = STMT_VINFO_DR_STEP (stmt_info); 2505 if (integer_zerop (step)) 2506 { 2507 if (dump_enabled_p ()) 2508 dump_printf_loc (MSG_NOTE, vect_location, 2509 "zero step in outer loop.\n"); 2510 return DR_IS_READ (dr); 2511 } 2512 } 2513 2514 /* Consecutive? */ 2515 if (TREE_CODE (step) == INTEGER_CST) 2516 { 2517 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2518 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2519 || (dr_step < 0 2520 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2521 { 2522 /* Mark that it is not interleaving. */ 2523 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2524 return true; 2525 } 2526 } 2527 2528 if (loop && nested_in_vect_loop_p (loop, stmt)) 2529 { 2530 if (dump_enabled_p ()) 2531 dump_printf_loc (MSG_NOTE, vect_location, 2532 "grouped access in outer loop.\n"); 2533 return false; 2534 } 2535 2536 2537 /* Assume this is a DR handled by non-constant strided load case. */ 2538 if (TREE_CODE (step) != INTEGER_CST) 2539 return (STMT_VINFO_STRIDED_P (stmt_info) 2540 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2541 || vect_analyze_group_access (dr))); 2542 2543 /* Not consecutive access - check if it's a part of interleaving group. */ 2544 return vect_analyze_group_access (dr); 2545 } 2546 2547 2548 2549 /* A helper function used in the comparator function to sort data 2550 references. T1 and T2 are two data references to be compared. 2551 The function returns -1, 0, or 1. */ 2552 2553 static int 2554 compare_tree (tree t1, tree t2) 2555 { 2556 int i, cmp; 2557 enum tree_code code; 2558 char tclass; 2559 2560 if (t1 == t2) 2561 return 0; 2562 if (t1 == NULL) 2563 return -1; 2564 if (t2 == NULL) 2565 return 1; 2566 2567 STRIP_NOPS (t1); 2568 STRIP_NOPS (t2); 2569 2570 if (TREE_CODE (t1) != TREE_CODE (t2)) 2571 return TREE_CODE (t1) < TREE_CODE (t2) ? 
-1 : 1; 2572 2573 code = TREE_CODE (t1); 2574 switch (code) 2575 { 2576 /* For const values, we can just use hash values for comparisons. */ 2577 case INTEGER_CST: 2578 case REAL_CST: 2579 case FIXED_CST: 2580 case STRING_CST: 2581 case COMPLEX_CST: 2582 case VECTOR_CST: 2583 { 2584 hashval_t h1 = iterative_hash_expr (t1, 0); 2585 hashval_t h2 = iterative_hash_expr (t2, 0); 2586 if (h1 != h2) 2587 return h1 < h2 ? -1 : 1; 2588 break; 2589 } 2590 2591 case SSA_NAME: 2592 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2)); 2593 if (cmp != 0) 2594 return cmp; 2595 2596 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2)) 2597 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1; 2598 break; 2599 2600 default: 2601 tclass = TREE_CODE_CLASS (code); 2602 2603 /* For var-decl, we could compare their UIDs. */ 2604 if (tclass == tcc_declaration) 2605 { 2606 if (DECL_UID (t1) != DECL_UID (t2)) 2607 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1; 2608 break; 2609 } 2610 2611 /* For expressions with operands, compare their operands recursively. */ 2612 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i) 2613 { 2614 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i)); 2615 if (cmp != 0) 2616 return cmp; 2617 } 2618 } 2619 2620 return 0; 2621 } 2622 2623 2624 /* Compare two data-references DRA and DRB to group them into chunks 2625 suitable for grouping. */ 2626 2627 static int 2628 dr_group_sort_cmp (const void *dra_, const void *drb_) 2629 { 2630 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2631 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2632 int cmp; 2633 2634 /* Stabilize sort. */ 2635 if (dra == drb) 2636 return 0; 2637 2638 /* DRs in different loops never belong to the same group. */ 2639 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2640 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2641 if (loopa != loopb) 2642 return loopa->num < loopb->num ? -1 : 1; 2643 2644 /* Ordering of DRs according to base. */ 2645 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)) 2646 { 2647 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)); 2648 if (cmp != 0) 2649 return cmp; 2650 } 2651 2652 /* And according to DR_OFFSET. */ 2653 if (!dr_equal_offsets_p (dra, drb)) 2654 { 2655 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2656 if (cmp != 0) 2657 return cmp; 2658 } 2659 2660 /* Put reads before writes. */ 2661 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2662 return DR_IS_READ (dra) ? -1 : 1; 2663 2664 /* Then sort after access size. */ 2665 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2666 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0)) 2667 { 2668 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2669 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2670 if (cmp != 0) 2671 return cmp; 2672 } 2673 2674 /* And after step. */ 2675 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2676 { 2677 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb)); 2678 if (cmp != 0) 2679 return cmp; 2680 } 2681 2682 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2683 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)); 2684 if (cmp == 0) 2685 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; 2686 return cmp; 2687 } 2688 2689 /* Function vect_analyze_data_ref_accesses. 2690 2691 Analyze the access pattern of all the data references in the loop. 
2692 2693 FORNOW: the only access pattern that is considered vectorizable is a 2694 simple step 1 (consecutive) access. 2695 2696 FORNOW: handle only arrays and pointer accesses. */ 2697 2698 bool 2699 vect_analyze_data_ref_accesses (vec_info *vinfo) 2700 { 2701 unsigned int i; 2702 vec<data_reference_p> datarefs = vinfo->datarefs; 2703 struct data_reference *dr; 2704 2705 if (dump_enabled_p ()) 2706 dump_printf_loc (MSG_NOTE, vect_location, 2707 "=== vect_analyze_data_ref_accesses ===\n"); 2708 2709 if (datarefs.is_empty ()) 2710 return true; 2711 2712 /* Sort the array of datarefs to make building the interleaving chains 2713 linear. Don't modify the original vector's order, it is needed for 2714 determining what dependencies are reversed. */ 2715 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2716 datarefs_copy.qsort (dr_group_sort_cmp); 2717 2718 /* Build the interleaving chains. */ 2719 for (i = 0; i < datarefs_copy.length () - 1;) 2720 { 2721 data_reference_p dra = datarefs_copy[i]; 2722 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2723 stmt_vec_info lastinfo = NULL; 2724 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a)) 2725 { 2726 ++i; 2727 continue; 2728 } 2729 for (i = i + 1; i < datarefs_copy.length (); ++i) 2730 { 2731 data_reference_p drb = datarefs_copy[i]; 2732 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2733 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 2734 break; 2735 2736 /* ??? Imperfect sorting (non-compatible types, non-modulo 2737 accesses, same accesses) can lead to a group being artificially 2738 split here as we don't just skip over those. If it really 2739 matters we can push those to a worklist and re-iterate 2740 over them. Then we can just skip ahead to the next DR here. */ 2741 2742 /* DRs in a different loop should not be put into the same 2743 interleaving group. */ 2744 if (gimple_bb (DR_STMT (dra))->loop_father 2745 != gimple_bb (DR_STMT (drb))->loop_father) 2746 break; 2747 2748 /* Check that the data-refs have the same first location (except init) 2749 and they are both either store or load (not load and store, 2750 not masked loads or stores). */ 2751 if (DR_IS_READ (dra) != DR_IS_READ (drb) 2752 || !operand_equal_p (DR_BASE_ADDRESS (dra), 2753 DR_BASE_ADDRESS (drb), 0) 2754 || !dr_equal_offsets_p (dra, drb) 2755 || !gimple_assign_single_p (DR_STMT (dra)) 2756 || !gimple_assign_single_p (DR_STMT (drb))) 2757 break; 2758 2759 /* Check that the data-refs have the same constant size. */ 2760 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 2761 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 2762 if (!tree_fits_uhwi_p (sza) 2763 || !tree_fits_uhwi_p (szb) 2764 || !tree_int_cst_equal (sza, szb)) 2765 break; 2766 2767 /* Check that the data-refs have the same step. */ 2768 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2769 break; 2770 2771 /* Do not place the same access in the interleaving chain twice. */ 2772 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0) 2773 break; 2774 2775 /* Check the types are compatible. 2776 ??? We don't distinguish this during sorting. */ 2777 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 2778 TREE_TYPE (DR_REF (drb)))) 2779 break; 2780 2781 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). 
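	     As a hypothetical example of the chain building that follows: for the stores p[2*i] = x; p[2*i+1] = y; with int elements, the two DRs share base address, offset, size and step, and their DR_INIT values differ by exactly the element size, so the second store is linked after the first through GROUP_FIRST_ELEMENT and GROUP_NEXT_ELEMENT.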
*/ 2782 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 2783 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 2784 gcc_assert (init_a <= init_b); 2785 2786 /* If init_b == init_a + the size of the type * k, we have an 2787 interleaving, and DRA is accessed before DRB. */ 2788 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 2789 if (type_size_a == 0 2790 || (init_b - init_a) % type_size_a != 0) 2791 break; 2792 2793 /* If we have a store, the accesses are adjacent. This splits 2794 groups into chunks we support (we don't support vectorization 2795 of stores with gaps). */ 2796 if (!DR_IS_READ (dra) 2797 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW 2798 (DR_INIT (datarefs_copy[i-1])) 2799 != type_size_a)) 2800 break; 2801 2802 /* If the step (if not zero or non-constant) is greater than the 2803 difference between data-refs' inits this splits groups into 2804 suitable sizes. */ 2805 if (tree_fits_shwi_p (DR_STEP (dra))) 2806 { 2807 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 2808 if (step != 0 && step <= (init_b - init_a)) 2809 break; 2810 } 2811 2812 if (dump_enabled_p ()) 2813 { 2814 dump_printf_loc (MSG_NOTE, vect_location, 2815 "Detected interleaving "); 2816 if (DR_IS_READ (dra)) 2817 dump_printf (MSG_NOTE, "load "); 2818 else 2819 dump_printf (MSG_NOTE, "store "); 2820 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2821 dump_printf (MSG_NOTE, " and "); 2822 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2823 dump_printf (MSG_NOTE, "\n"); 2824 } 2825 2826 /* Link the found element into the group list. */ 2827 if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) 2828 { 2829 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); 2830 lastinfo = stmtinfo_a; 2831 } 2832 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); 2833 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); 2834 lastinfo = stmtinfo_b; 2835 } 2836 } 2837 2838 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 2839 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 2840 && !vect_analyze_data_ref_access (dr)) 2841 { 2842 if (dump_enabled_p ()) 2843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2844 "not vectorized: complicated access pattern.\n"); 2845 2846 if (is_a <bb_vec_info> (vinfo)) 2847 { 2848 /* Mark the statement as not vectorizable. */ 2849 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2850 continue; 2851 } 2852 else 2853 { 2854 datarefs_copy.release (); 2855 return false; 2856 } 2857 } 2858 2859 datarefs_copy.release (); 2860 return true; 2861 } 2862 2863 2864 /* Operator == between two dr_with_seg_len objects. 2865 2866 This equality operator is used to make sure two data refs 2867 are the same one so that we will consider to combine the 2868 aliasing checks of those two pairs of data dependent data 2869 refs. */ 2870 2871 static bool 2872 operator == (const dr_with_seg_len& d1, 2873 const dr_with_seg_len& d2) 2874 { 2875 return operand_equal_p (DR_BASE_ADDRESS (d1.dr), 2876 DR_BASE_ADDRESS (d2.dr), 0) 2877 && compare_tree (d1.offset, d2.offset) == 0 2878 && compare_tree (d1.seg_len, d2.seg_len) == 0; 2879 } 2880 2881 /* Function comp_dr_with_seg_len_pair. 2882 2883 Comparison function for sorting objects of dr_with_seg_len_pair_t 2884 so that we can combine aliasing checks in one scan. 
*/ 2885 2886 static int 2887 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_) 2888 { 2889 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_; 2890 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_; 2891 2892 const dr_with_seg_len &p11 = p1->first, 2893 &p12 = p1->second, 2894 &p21 = p2->first, 2895 &p22 = p2->second; 2896 2897 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks 2898 if a and c have the same base address and step, and b and d have the same 2899 address and step. Therefore, if either a&c or b&d don't have the same address 2900 and step, we don't care about the order of those two pairs after sorting. */ 2901 int comp_res; 2902 2903 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr), 2904 DR_BASE_ADDRESS (p21.dr))) != 0) 2905 return comp_res; 2906 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr), 2907 DR_BASE_ADDRESS (p22.dr))) != 0) 2908 return comp_res; 2909 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0) 2910 return comp_res; 2911 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0) 2912 return comp_res; 2913 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0) 2914 return comp_res; 2915 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0) 2916 return comp_res; 2917 2918 return 0; 2919 } 2920 2921 /* Function vect_vfa_segment_size. 2922 2923 Create an expression that computes the size of the segment 2924 that will be accessed for a data reference. The function takes into 2925 account that realignment loads may access one more vector. 2926 2927 Input: 2928 DR: The data reference. 2929 LENGTH_FACTOR: segment length to consider. 2930 2931 Return an expression whose value is the size of the segment which will be 2932 accessed by DR. */ 2933 2934 static tree 2935 vect_vfa_segment_size (struct data_reference *dr, tree length_factor) 2936 { 2937 tree segment_length; 2938 2939 if (integer_zerop (DR_STEP (dr))) 2940 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 2941 else 2942 segment_length = size_binop (MULT_EXPR, 2943 fold_convert (sizetype, DR_STEP (dr)), 2944 fold_convert (sizetype, length_factor)); 2945 2946 if (vect_supportable_dr_alignment (dr, false) 2947 == dr_explicit_realign_optimized) 2948 { 2949 tree vector_size = TYPE_SIZE_UNIT 2950 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); 2951 2952 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size); 2953 } 2954 return segment_length; 2955 } 2956 2957 /* Function vect_prune_runtime_alias_test_list. 2958 2959 Prune a list of ddrs to be tested at run-time by versioning for alias. 2960 Merge several alias checks into one if possible. 2961 Return FALSE if resulting list of ddrs is longer than allowed by 2962 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. 
*/ 2963 2964 bool 2965 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 2966 { 2967 vec<ddr_p> may_alias_ddrs = 2968 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 2969 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs = 2970 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 2971 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2972 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 2973 2974 ddr_p ddr; 2975 unsigned int i; 2976 tree length_factor; 2977 2978 if (dump_enabled_p ()) 2979 dump_printf_loc (MSG_NOTE, vect_location, 2980 "=== vect_prune_runtime_alias_test_list ===\n"); 2981 2982 if (may_alias_ddrs.is_empty ()) 2983 return true; 2984 2985 /* Basically, for each pair of dependent data refs store_ptr_0 2986 and load_ptr_0, we create an expression: 2987 2988 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 2989 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0)) 2990 2991 for aliasing checks. However, in some cases we can decrease 2992 the number of checks by combining two checks into one. For 2993 example, suppose we have another pair of data refs store_ptr_0 2994 and load_ptr_1, and if the following condition is satisfied: 2995 2996 load_ptr_0 < load_ptr_1 && 2997 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0 2998 2999 (this condition means, in each iteration of vectorized loop, 3000 the accessed memory of store_ptr_0 cannot be between the memory 3001 of load_ptr_0 and load_ptr_1.) 3002 3003 we then can use only the following expression to finish the 3004 alising checks between store_ptr_0 & load_ptr_0 and 3005 store_ptr_0 & load_ptr_1: 3006 3007 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 3008 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0)) 3009 3010 Note that we only consider that load_ptr_0 and load_ptr_1 have the 3011 same basic address. */ 3012 3013 comp_alias_ddrs.create (may_alias_ddrs.length ()); 3014 3015 /* First, we collect all data ref pairs for aliasing checks. */ 3016 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 3017 { 3018 struct data_reference *dr_a, *dr_b; 3019 gimple *dr_group_first_a, *dr_group_first_b; 3020 tree segment_length_a, segment_length_b; 3021 gimple *stmt_a, *stmt_b; 3022 3023 dr_a = DDR_A (ddr); 3024 stmt_a = DR_STMT (DDR_A (ddr)); 3025 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); 3026 if (dr_group_first_a) 3027 { 3028 stmt_a = dr_group_first_a; 3029 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); 3030 } 3031 3032 dr_b = DDR_B (ddr); 3033 stmt_b = DR_STMT (DDR_B (ddr)); 3034 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); 3035 if (dr_group_first_b) 3036 { 3037 stmt_b = dr_group_first_b; 3038 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); 3039 } 3040 3041 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) 3042 length_factor = scalar_loop_iters; 3043 else 3044 length_factor = size_int (vect_factor); 3045 segment_length_a = vect_vfa_segment_size (dr_a, length_factor); 3046 segment_length_b = vect_vfa_segment_size (dr_b, length_factor); 3047 3048 dr_with_seg_len_pair_t dr_with_seg_len_pair 3049 (dr_with_seg_len (dr_a, segment_length_a), 3050 dr_with_seg_len (dr_b, segment_length_b)); 3051 3052 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0) 3053 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 3054 3055 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3056 } 3057 3058 /* Second, we sort the collected data ref pairs so that we can scan 3059 them once to combine all possible aliasing checks. 
*/ 3060 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair); 3061 3062 /* Third, we scan the sorted dr pairs and check if we can combine 3063 alias checks of two neighboring dr pairs. */ 3064 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i) 3065 { 3066 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */ 3067 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first, 3068 *dr_b1 = &comp_alias_ddrs[i-1].second, 3069 *dr_a2 = &comp_alias_ddrs[i].first, 3070 *dr_b2 = &comp_alias_ddrs[i].second; 3071 3072 /* Remove duplicate data ref pairs. */ 3073 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2) 3074 { 3075 if (dump_enabled_p ()) 3076 { 3077 dump_printf_loc (MSG_NOTE, vect_location, 3078 "found equal ranges "); 3079 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3080 DR_REF (dr_a1->dr)); 3081 dump_printf (MSG_NOTE, ", "); 3082 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3083 DR_REF (dr_b1->dr)); 3084 dump_printf (MSG_NOTE, " and "); 3085 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3086 DR_REF (dr_a2->dr)); 3087 dump_printf (MSG_NOTE, ", "); 3088 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3089 DR_REF (dr_b2->dr)); 3090 dump_printf (MSG_NOTE, "\n"); 3091 } 3092 3093 comp_alias_ddrs.ordered_remove (i--); 3094 continue; 3095 } 3096 3097 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2) 3098 { 3099 /* We consider the case that DR_B1 and DR_B2 are same memrefs, 3100 and DR_A1 and DR_A2 are two consecutive memrefs. */ 3101 if (*dr_a1 == *dr_a2) 3102 { 3103 std::swap (dr_a1, dr_b1); 3104 std::swap (dr_a2, dr_b2); 3105 } 3106 3107 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr), 3108 DR_BASE_ADDRESS (dr_a2->dr), 3109 0) 3110 || !tree_fits_shwi_p (dr_a1->offset) 3111 || !tree_fits_shwi_p (dr_a2->offset)) 3112 continue; 3113 3114 /* Make sure dr_a1 starts left of dr_a2. */ 3115 if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset)) 3116 std::swap (*dr_a1, *dr_a2); 3117 3118 unsigned HOST_WIDE_INT diff 3119 = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset); 3120 3121 3122 bool do_remove = false; 3123 3124 /* If the left segment does not extend beyond the start of the 3125 right segment the new segment length is that of the right 3126 plus the segment distance. */ 3127 if (tree_fits_uhwi_p (dr_a1->seg_len) 3128 && compare_tree_int (dr_a1->seg_len, diff) <= 0) 3129 { 3130 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len, 3131 size_int (diff)); 3132 do_remove = true; 3133 } 3134 /* Generally the new segment length is the maximum of the 3135 left segment size and the right segment size plus the distance. 3136 ??? We can also build tree MAX_EXPR here but it's not clear this 3137 is profitable. */ 3138 else if (tree_fits_uhwi_p (dr_a1->seg_len) 3139 && tree_fits_uhwi_p (dr_a2->seg_len)) 3140 { 3141 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len); 3142 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len); 3143 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2)); 3144 do_remove = true; 3145 } 3146 /* Now we check if the following condition is satisfied: 3147 3148 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B 3149 3150 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However, 3151 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we 3152 have to make a best estimation. 
We can get the minimum value 3153 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B, 3154 then either of the following two conditions can guarantee the 3155 one above: 3156 3157 1: DIFF <= MIN_SEG_LEN_B 3158 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */ 3159 else 3160 { 3161 unsigned HOST_WIDE_INT min_seg_len_b 3162 = (tree_fits_uhwi_p (dr_b1->seg_len) 3163 ? tree_to_uhwi (dr_b1->seg_len) 3164 : vect_factor); 3165 3166 if (diff <= min_seg_len_b 3167 || (tree_fits_uhwi_p (dr_a1->seg_len) 3168 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b)) 3169 { 3170 dr_a1->seg_len = size_binop (PLUS_EXPR, 3171 dr_a2->seg_len, size_int (diff)); 3172 do_remove = true; 3173 } 3174 } 3175 3176 if (do_remove) 3177 { 3178 if (dump_enabled_p ()) 3179 { 3180 dump_printf_loc (MSG_NOTE, vect_location, 3181 "merging ranges for "); 3182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr)); 3183 dump_printf (MSG_NOTE, ", "); 3184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr)); 3185 dump_printf (MSG_NOTE, " and "); 3186 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr)); 3187 dump_printf (MSG_NOTE, ", "); 3188 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr)); 3189 dump_printf (MSG_NOTE, "\n"); 3190 } 3191 comp_alias_ddrs.ordered_remove (i--); 3192 } 3193 } 3194 } 3195 3196 dump_printf_loc (MSG_NOTE, vect_location, 3197 "improved number of alias checks from %d to %d\n", 3198 may_alias_ddrs.length (), comp_alias_ddrs.length ()); 3199 if ((int) comp_alias_ddrs.length () > 3200 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 3201 return false; 3202 3203 return true; 3204 } 3205 3206 /* Check whether a non-affine read or write in stmt is suitable for gather load 3207 or scatter store and if so, return a builtin decl for that operation. */ 3208 3209 tree 3210 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep, 3211 tree *offp, int *scalep) 3212 { 3213 HOST_WIDE_INT scale = 1, pbitpos, pbitsize; 3214 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3215 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3216 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3217 tree offtype = NULL_TREE; 3218 tree decl, base, off; 3219 machine_mode pmode; 3220 int punsignedp, reversep, pvolatilep = 0; 3221 3222 base = DR_REF (dr); 3223 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3224 see if we can use the def stmt of the address. */ 3225 if (is_gimple_call (stmt) 3226 && gimple_call_internal_p (stmt) 3227 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD 3228 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE) 3229 && TREE_CODE (base) == MEM_REF 3230 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3231 && integer_zerop (TREE_OPERAND (base, 1)) 3232 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3233 { 3234 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3235 if (is_gimple_assign (def_stmt) 3236 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3237 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3238 } 3239 3240 /* The gather and scatter builtins need address of the form 3241 loop_invariant + vector * {1, 2, 4, 8} 3242 or 3243 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3244 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3245 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3246 multiplications and additions in it. 
To get a vector, we need 3247 a single SSA_NAME that will be defined in the loop and will 3248 contain everything that is not loop invariant and that can be 3249 vectorized. The following code attempts to find such a preexisting 3250 SSA_NAME OFF and put the loop invariants into a tree BASE 3251 that can be gimplified before the loop. */ 3252 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, 3253 &punsignedp, &reversep, &pvolatilep, false); 3254 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep); 3255 3256 if (TREE_CODE (base) == MEM_REF) 3257 { 3258 if (!integer_zerop (TREE_OPERAND (base, 1))) 3259 { 3260 if (off == NULL_TREE) 3261 { 3262 offset_int moff = mem_ref_offset (base); 3263 off = wide_int_to_tree (sizetype, moff); 3264 } 3265 else 3266 off = size_binop (PLUS_EXPR, off, 3267 fold_convert (sizetype, TREE_OPERAND (base, 1))); 3268 } 3269 base = TREE_OPERAND (base, 0); 3270 } 3271 else 3272 base = build_fold_addr_expr (base); 3273 3274 if (off == NULL_TREE) 3275 off = size_zero_node; 3276 3277 /* If base is not loop invariant, then if off is 0 we start with just 3278 the constant offset in the loop invariant BASE and continue with base 3279 as OFF; otherwise give up. 3280 We could handle that case by gimplifying the addition of base + off 3281 into some SSA_NAME and using that as off, but for now punt. */ 3282 if (!expr_invariant_in_loop_p (loop, base)) 3283 { 3284 if (!integer_zerop (off)) 3285 return NULL_TREE; 3286 off = base; 3287 base = size_int (pbitpos / BITS_PER_UNIT); 3288 } 3289 /* Otherwise put base + constant offset into the loop invariant BASE 3290 and continue with OFF. */ 3291 else 3292 { 3293 base = fold_convert (sizetype, base); 3294 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT)); 3295 } 3296 3297 /* OFF at this point may be either an SSA_NAME or some tree expression 3298 from get_inner_reference. Try to peel off loop invariants from it 3299 into BASE as long as possible. 
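     A hypothetical illustration of the peeling loop below: if OFF is (sizetype) idx_i * 4 + 16, where idx_i is defined in the loop and 16 is loop invariant, the PLUS_EXPR case moves 16 into BASE, the MULT_EXPR case records SCALE = 4, and we are left with OFF = idx_i, which matches the loop_invariant + vector * {1, 2, 4, 8} form the builtins expect.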
*/ 3300 STRIP_NOPS (off); 3301 while (offtype == NULL_TREE) 3302 { 3303 enum tree_code code; 3304 tree op0, op1, add = NULL_TREE; 3305 3306 if (TREE_CODE (off) == SSA_NAME) 3307 { 3308 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3309 3310 if (expr_invariant_in_loop_p (loop, off)) 3311 return NULL_TREE; 3312 3313 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3314 break; 3315 3316 op0 = gimple_assign_rhs1 (def_stmt); 3317 code = gimple_assign_rhs_code (def_stmt); 3318 op1 = gimple_assign_rhs2 (def_stmt); 3319 } 3320 else 3321 { 3322 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3323 return NULL_TREE; 3324 code = TREE_CODE (off); 3325 extract_ops_from_tree (off, &code, &op0, &op1); 3326 } 3327 switch (code) 3328 { 3329 case POINTER_PLUS_EXPR: 3330 case PLUS_EXPR: 3331 if (expr_invariant_in_loop_p (loop, op0)) 3332 { 3333 add = op0; 3334 off = op1; 3335 do_add: 3336 add = fold_convert (sizetype, add); 3337 if (scale != 1) 3338 add = size_binop (MULT_EXPR, add, size_int (scale)); 3339 base = size_binop (PLUS_EXPR, base, add); 3340 continue; 3341 } 3342 if (expr_invariant_in_loop_p (loop, op1)) 3343 { 3344 add = op1; 3345 off = op0; 3346 goto do_add; 3347 } 3348 break; 3349 case MINUS_EXPR: 3350 if (expr_invariant_in_loop_p (loop, op1)) 3351 { 3352 add = fold_convert (sizetype, op1); 3353 add = size_binop (MINUS_EXPR, size_zero_node, add); 3354 off = op0; 3355 goto do_add; 3356 } 3357 break; 3358 case MULT_EXPR: 3359 if (scale == 1 && tree_fits_shwi_p (op1)) 3360 { 3361 scale = tree_to_shwi (op1); 3362 off = op0; 3363 continue; 3364 } 3365 break; 3366 case SSA_NAME: 3367 off = op0; 3368 continue; 3369 CASE_CONVERT: 3370 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3371 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3372 break; 3373 if (TYPE_PRECISION (TREE_TYPE (op0)) 3374 == TYPE_PRECISION (TREE_TYPE (off))) 3375 { 3376 off = op0; 3377 continue; 3378 } 3379 if (TYPE_PRECISION (TREE_TYPE (op0)) 3380 < TYPE_PRECISION (TREE_TYPE (off))) 3381 { 3382 off = op0; 3383 offtype = TREE_TYPE (off); 3384 STRIP_NOPS (off); 3385 continue; 3386 } 3387 break; 3388 default: 3389 break; 3390 } 3391 break; 3392 } 3393 3394 /* If at the end OFF still isn't a SSA_NAME or isn't 3395 defined in the loop, punt. */ 3396 if (TREE_CODE (off) != SSA_NAME 3397 || expr_invariant_in_loop_p (loop, off)) 3398 return NULL_TREE; 3399 3400 if (offtype == NULL_TREE) 3401 offtype = TREE_TYPE (off); 3402 3403 if (DR_IS_READ (dr)) 3404 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), 3405 offtype, scale); 3406 else 3407 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info), 3408 offtype, scale); 3409 3410 if (decl == NULL_TREE) 3411 return NULL_TREE; 3412 3413 if (basep) 3414 *basep = base; 3415 if (offp) 3416 *offp = off; 3417 if (scalep) 3418 *scalep = scale; 3419 return decl; 3420 } 3421 3422 /* Function vect_analyze_data_refs. 3423 3424 Find all the data references in the loop or basic block. 3425 3426 The general structure of the analysis of data refs in the vectorizer is as 3427 follows: 3428 1- vect_analyze_data_refs(loop/bb): call 3429 compute_data_dependences_for_loop/bb to find and analyze all data-refs 3430 in the loop/bb and their dependences. 3431 2- vect_analyze_dependences(): apply dependence testing using ddrs. 3432 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 3433 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 
3434 3435 */ 3436 3437 bool 3438 vect_analyze_data_refs (vec_info *vinfo, int *min_vf) 3439 { 3440 struct loop *loop = NULL; 3441 unsigned int i; 3442 struct data_reference *dr; 3443 tree scalar_type; 3444 3445 if (dump_enabled_p ()) 3446 dump_printf_loc (MSG_NOTE, vect_location, 3447 "=== vect_analyze_data_refs ===\n"); 3448 3449 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 3450 loop = LOOP_VINFO_LOOP (loop_vinfo); 3451 3452 /* Go through the data-refs, check that the analysis succeeded. Update 3453 pointer from stmt_vec_info struct to DR and vectype. */ 3454 3455 vec<data_reference_p> datarefs = vinfo->datarefs; 3456 FOR_EACH_VEC_ELT (datarefs, i, dr) 3457 { 3458 gimple *stmt; 3459 stmt_vec_info stmt_info; 3460 tree base, offset, init; 3461 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 3462 bool simd_lane_access = false; 3463 int vf; 3464 3465 again: 3466 if (!dr || !DR_REF (dr)) 3467 { 3468 if (dump_enabled_p ()) 3469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3470 "not vectorized: unhandled data-ref\n"); 3471 return false; 3472 } 3473 3474 stmt = DR_STMT (dr); 3475 stmt_info = vinfo_for_stmt (stmt); 3476 3477 /* Discard clobbers from the dataref vector. We will remove 3478 clobber stmts during vectorization. */ 3479 if (gimple_clobber_p (stmt)) 3480 { 3481 free_data_ref (dr); 3482 if (i == datarefs.length () - 1) 3483 { 3484 datarefs.pop (); 3485 break; 3486 } 3487 datarefs.ordered_remove (i); 3488 dr = datarefs[i]; 3489 goto again; 3490 } 3491 3492 /* Check that analysis of the data-ref succeeded. */ 3493 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 3494 || !DR_STEP (dr)) 3495 { 3496 bool maybe_gather 3497 = DR_IS_READ (dr) 3498 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3499 && targetm.vectorize.builtin_gather != NULL; 3500 bool maybe_scatter 3501 = DR_IS_WRITE (dr) 3502 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3503 && targetm.vectorize.builtin_scatter != NULL; 3504 bool maybe_simd_lane_access 3505 = is_a <loop_vec_info> (vinfo) && loop->simduid; 3506 3507 /* If target supports vector gather loads or scatter stores, or if 3508 this might be a SIMD lane access, see if they can't be used. */ 3509 if (is_a <loop_vec_info> (vinfo) 3510 && (maybe_gather || maybe_scatter || maybe_simd_lane_access) 3511 && !nested_in_vect_loop_p (loop, stmt)) 3512 { 3513 struct data_reference *newdr 3514 = create_data_ref (NULL, loop_containing_stmt (stmt), 3515 DR_REF (dr), stmt, maybe_scatter ? 
false : true); 3516 gcc_assert (newdr != NULL && DR_REF (newdr)); 3517 if (DR_BASE_ADDRESS (newdr) 3518 && DR_OFFSET (newdr) 3519 && DR_INIT (newdr) 3520 && DR_STEP (newdr) 3521 && integer_zerop (DR_STEP (newdr))) 3522 { 3523 if (maybe_simd_lane_access) 3524 { 3525 tree off = DR_OFFSET (newdr); 3526 STRIP_NOPS (off); 3527 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 3528 && TREE_CODE (off) == MULT_EXPR 3529 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 3530 { 3531 tree step = TREE_OPERAND (off, 1); 3532 off = TREE_OPERAND (off, 0); 3533 STRIP_NOPS (off); 3534 if (CONVERT_EXPR_P (off) 3535 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 3536 0))) 3537 < TYPE_PRECISION (TREE_TYPE (off))) 3538 off = TREE_OPERAND (off, 0); 3539 if (TREE_CODE (off) == SSA_NAME) 3540 { 3541 gimple *def = SSA_NAME_DEF_STMT (off); 3542 tree reft = TREE_TYPE (DR_REF (newdr)); 3543 if (is_gimple_call (def) 3544 && gimple_call_internal_p (def) 3545 && (gimple_call_internal_fn (def) 3546 == IFN_GOMP_SIMD_LANE)) 3547 { 3548 tree arg = gimple_call_arg (def, 0); 3549 gcc_assert (TREE_CODE (arg) == SSA_NAME); 3550 arg = SSA_NAME_VAR (arg); 3551 if (arg == loop->simduid 3552 /* For now. */ 3553 && tree_int_cst_equal 3554 (TYPE_SIZE_UNIT (reft), 3555 step)) 3556 { 3557 DR_OFFSET (newdr) = ssize_int (0); 3558 DR_STEP (newdr) = step; 3559 DR_ALIGNED_TO (newdr) 3560 = size_int (BIGGEST_ALIGNMENT); 3561 dr = newdr; 3562 simd_lane_access = true; 3563 } 3564 } 3565 } 3566 } 3567 } 3568 if (!simd_lane_access && (maybe_gather || maybe_scatter)) 3569 { 3570 dr = newdr; 3571 if (maybe_gather) 3572 gatherscatter = GATHER; 3573 else 3574 gatherscatter = SCATTER; 3575 } 3576 } 3577 if (gatherscatter == SG_NONE && !simd_lane_access) 3578 free_data_ref (newdr); 3579 } 3580 3581 if (gatherscatter == SG_NONE && !simd_lane_access) 3582 { 3583 if (dump_enabled_p ()) 3584 { 3585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3586 "not vectorized: data ref analysis " 3587 "failed "); 3588 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3589 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3590 } 3591 3592 if (is_a <bb_vec_info> (vinfo)) 3593 break; 3594 3595 return false; 3596 } 3597 } 3598 3599 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 3600 { 3601 if (dump_enabled_p ()) 3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3603 "not vectorized: base addr of dr is a " 3604 "constant\n"); 3605 3606 if (is_a <bb_vec_info> (vinfo)) 3607 break; 3608 3609 if (gatherscatter != SG_NONE || simd_lane_access) 3610 free_data_ref (dr); 3611 return false; 3612 } 3613 3614 if (TREE_THIS_VOLATILE (DR_REF (dr))) 3615 { 3616 if (dump_enabled_p ()) 3617 { 3618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3619 "not vectorized: volatile type "); 3620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3621 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3622 } 3623 3624 if (is_a <bb_vec_info> (vinfo)) 3625 break; 3626 3627 return false; 3628 } 3629 3630 if (stmt_can_throw_internal (stmt)) 3631 { 3632 if (dump_enabled_p ()) 3633 { 3634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3635 "not vectorized: statement can throw an " 3636 "exception "); 3637 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3638 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3639 } 3640 3641 if (is_a <bb_vec_info> (vinfo)) 3642 break; 3643 3644 if (gatherscatter != SG_NONE || simd_lane_access) 3645 free_data_ref (dr); 3646 return false; 3647 } 3648 3649 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 3650 && 
DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 3651 { 3652 if (dump_enabled_p ()) 3653 { 3654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3655 "not vectorized: statement is bitfield " 3656 "access "); 3657 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3658 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3659 } 3660 3661 if (is_a <bb_vec_info> (vinfo)) 3662 break; 3663 3664 if (gatherscatter != SG_NONE || simd_lane_access) 3665 free_data_ref (dr); 3666 return false; 3667 } 3668 3669 base = unshare_expr (DR_BASE_ADDRESS (dr)); 3670 offset = unshare_expr (DR_OFFSET (dr)); 3671 init = unshare_expr (DR_INIT (dr)); 3672 3673 if (is_gimple_call (stmt) 3674 && (!gimple_call_internal_p (stmt) 3675 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 3676 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 3677 { 3678 if (dump_enabled_p ()) 3679 { 3680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3681 "not vectorized: dr in a call "); 3682 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3683 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3684 } 3685 3686 if (is_a <bb_vec_info> (vinfo)) 3687 break; 3688 3689 if (gatherscatter != SG_NONE || simd_lane_access) 3690 free_data_ref (dr); 3691 return false; 3692 } 3693 3694 /* Update DR field in stmt_vec_info struct. */ 3695 3696 /* If the dataref is in an inner-loop of the loop that is considered 3697 for vectorization, we also want to analyze the access relative to 3698 the outer-loop (DR contains information only relative to the 3699 inner-most enclosing loop). We do that by building a reference to the 3700 first location accessed by the inner-loop, and analyzing it relative to 3701 the outer-loop. */ 3702 if (loop && nested_in_vect_loop_p (loop, stmt)) 3703 { 3704 tree outer_step, outer_base, outer_init; 3705 HOST_WIDE_INT pbitsize, pbitpos; 3706 tree poffset; 3707 machine_mode pmode; 3708 int punsignedp, preversep, pvolatilep; 3709 affine_iv base_iv, offset_iv; 3710 tree dinit; 3711 3712 /* Build a reference to the first location accessed by the 3713 inner-loop: *(BASE+INIT). (The first location is actually 3714 BASE+INIT+OFFSET, but we add OFFSET separately later).
*/ 3715 tree inner_base = build_fold_indirect_ref 3716 (fold_build_pointer_plus (base, init)); 3717 3718 if (dump_enabled_p ()) 3719 { 3720 dump_printf_loc (MSG_NOTE, vect_location, 3721 "analyze in outer-loop: "); 3722 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base); 3723 dump_printf (MSG_NOTE, "\n"); 3724 } 3725 3726 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, 3727 &poffset, &pmode, &punsignedp, 3728 &preversep, &pvolatilep, false); 3729 gcc_assert (outer_base != NULL_TREE); 3730 3731 if (pbitpos % BITS_PER_UNIT != 0) 3732 { 3733 if (dump_enabled_p ()) 3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3735 "failed: bit offset alignment.\n"); 3736 return false; 3737 } 3738 3739 if (preversep) 3740 { 3741 if (dump_enabled_p ()) 3742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3743 "failed: reverse storage order.\n"); 3744 return false; 3745 } 3746 3747 outer_base = build_fold_addr_expr (outer_base); 3748 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base, 3749 &base_iv, false)) 3750 { 3751 if (dump_enabled_p ()) 3752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3753 "failed: evolution of base is not affine.\n"); 3754 return false; 3755 } 3756 3757 if (offset) 3758 { 3759 if (poffset) 3760 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, 3761 poffset); 3762 else 3763 poffset = offset; 3764 } 3765 3766 if (!poffset) 3767 { 3768 offset_iv.base = ssize_int (0); 3769 offset_iv.step = ssize_int (0); 3770 } 3771 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset, 3772 &offset_iv, false)) 3773 { 3774 if (dump_enabled_p ()) 3775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3776 "evolution of offset is not affine.\n"); 3777 return false; 3778 } 3779 3780 outer_init = ssize_int (pbitpos / BITS_PER_UNIT); 3781 split_constant_offset (base_iv.base, &base_iv.base, &dinit); 3782 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3783 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); 3784 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3785 3786 outer_step = size_binop (PLUS_EXPR, 3787 fold_convert (ssizetype, base_iv.step), 3788 fold_convert (ssizetype, offset_iv.step)); 3789 3790 STMT_VINFO_DR_STEP (stmt_info) = outer_step; 3791 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ 3792 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; 3793 STMT_VINFO_DR_INIT (stmt_info) = outer_init; 3794 STMT_VINFO_DR_OFFSET (stmt_info) = 3795 fold_convert (ssizetype, offset_iv.base); 3796 STMT_VINFO_DR_ALIGNED_TO (stmt_info) = 3797 size_int (highest_pow2_factor (offset_iv.base)); 3798 3799 if (dump_enabled_p ()) 3800 { 3801 dump_printf_loc (MSG_NOTE, vect_location, 3802 "\touter base_address: "); 3803 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3804 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 3805 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 3806 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3807 STMT_VINFO_DR_OFFSET (stmt_info)); 3808 dump_printf (MSG_NOTE, 3809 "\n\touter constant offset from base address: "); 3810 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3811 STMT_VINFO_DR_INIT (stmt_info)); 3812 dump_printf (MSG_NOTE, "\n\touter step: "); 3813 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3814 STMT_VINFO_DR_STEP (stmt_info)); 3815 dump_printf (MSG_NOTE, "\n\touter aligned to: "); 3816 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3817 STMT_VINFO_DR_ALIGNED_TO (stmt_info)); 3818 dump_printf (MSG_NOTE, "\n"); 3819 } 3820 } 3821 3822 if (STMT_VINFO_DATA_REF (stmt_info)) 3823 
{ 3824 if (dump_enabled_p ()) 3825 { 3826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3827 "not vectorized: more than one data ref " 3828 "in stmt: "); 3829 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3830 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3831 } 3832 3833 if (is_a <bb_vec_info> (vinfo)) 3834 break; 3835 3836 if (gatherscatter != SG_NONE || simd_lane_access) 3837 free_data_ref (dr); 3838 return false; 3839 } 3840 3841 STMT_VINFO_DATA_REF (stmt_info) = dr; 3842 if (simd_lane_access) 3843 { 3844 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 3845 free_data_ref (datarefs[i]); 3846 datarefs[i] = dr; 3847 } 3848 3849 /* Set vectype for STMT. */ 3850 scalar_type = TREE_TYPE (DR_REF (dr)); 3851 STMT_VINFO_VECTYPE (stmt_info) 3852 = get_vectype_for_scalar_type (scalar_type); 3853 if (!STMT_VINFO_VECTYPE (stmt_info)) 3854 { 3855 if (dump_enabled_p ()) 3856 { 3857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3858 "not vectorized: no vectype for stmt: "); 3859 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3860 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 3861 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 3862 scalar_type); 3863 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3864 } 3865 3866 if (is_a <bb_vec_info> (vinfo)) 3867 { 3868 /* No vector type is fine, the ref can still participate 3869 in dependence analysis, we just can't vectorize it. */ 3870 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3871 continue; 3872 } 3873 3874 if (gatherscatter != SG_NONE || simd_lane_access) 3875 { 3876 STMT_VINFO_DATA_REF (stmt_info) = NULL; 3877 if (gatherscatter != SG_NONE) 3878 free_data_ref (dr); 3879 } 3880 return false; 3881 } 3882 else 3883 { 3884 if (dump_enabled_p ()) 3885 { 3886 dump_printf_loc (MSG_NOTE, vect_location, 3887 "got vectype for stmt: "); 3888 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 3889 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3890 STMT_VINFO_VECTYPE (stmt_info)); 3891 dump_printf (MSG_NOTE, "\n"); 3892 } 3893 } 3894 3895 /* Adjust the minimal vectorization factor according to the 3896 vector type. */ 3897 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 3898 if (vf > *min_vf) 3899 *min_vf = vf; 3900 3901 if (gatherscatter != SG_NONE) 3902 { 3903 tree off; 3904 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), 3905 NULL, &off, NULL) 3906 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) 3907 { 3908 STMT_VINFO_DATA_REF (stmt_info) = NULL; 3909 free_data_ref (dr); 3910 if (dump_enabled_p ()) 3911 { 3912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3913 (gatherscatter == GATHER) ? 
3914 "not vectorized: not suitable for gather " 3915 "load " : 3916 "not vectorized: not suitable for scatter " 3917 "store "); 3918 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3919 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3920 } 3921 return false; 3922 } 3923 3924 free_data_ref (datarefs[i]); 3925 datarefs[i] = dr; 3926 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 3927 } 3928 3929 else if (is_a <loop_vec_info> (vinfo) 3930 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 3931 { 3932 if (nested_in_vect_loop_p (loop, stmt)) 3933 { 3934 if (dump_enabled_p ()) 3935 { 3936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3937 "not vectorized: not suitable for strided " 3938 "load "); 3939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3940 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3941 } 3942 return false; 3943 } 3944 STMT_VINFO_STRIDED_P (stmt_info) = true; 3945 } 3946 } 3947 3948 /* If we stopped analysis at the first dataref we could not analyze 3949 when trying to vectorize a basic-block mark the rest of the datarefs 3950 as not vectorizable and truncate the vector of datarefs. That 3951 avoids spending useless time in analyzing their dependence. */ 3952 if (i != datarefs.length ()) 3953 { 3954 gcc_assert (is_a <bb_vec_info> (vinfo)); 3955 for (unsigned j = i; j < datarefs.length (); ++j) 3956 { 3957 data_reference_p dr = datarefs[j]; 3958 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 3959 free_data_ref (dr); 3960 } 3961 datarefs.truncate (i); 3962 } 3963 3964 return true; 3965 } 3966 3967 3968 /* Function vect_get_new_vect_var. 3969 3970 Returns a name for a new variable. The current naming scheme appends the 3971 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 3972 the name of vectorizer generated variables, and appends that to NAME if 3973 provided. */ 3974 3975 tree 3976 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 3977 { 3978 const char *prefix; 3979 tree new_vect_var; 3980 3981 switch (var_kind) 3982 { 3983 case vect_simple_var: 3984 prefix = "vect"; 3985 break; 3986 case vect_scalar_var: 3987 prefix = "stmp"; 3988 break; 3989 case vect_mask_var: 3990 prefix = "mask"; 3991 break; 3992 case vect_pointer_var: 3993 prefix = "vectp"; 3994 break; 3995 default: 3996 gcc_unreachable (); 3997 } 3998 3999 if (name) 4000 { 4001 char* tmp = concat (prefix, "_", name, NULL); 4002 new_vect_var = create_tmp_reg (type, tmp); 4003 free (tmp); 4004 } 4005 else 4006 new_vect_var = create_tmp_reg (type, prefix); 4007 4008 return new_vect_var; 4009 } 4010 4011 /* Like vect_get_new_vect_var but return an SSA name. */ 4012 4013 tree 4014 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4015 { 4016 const char *prefix; 4017 tree new_vect_var; 4018 4019 switch (var_kind) 4020 { 4021 case vect_simple_var: 4022 prefix = "vect"; 4023 break; 4024 case vect_scalar_var: 4025 prefix = "stmp"; 4026 break; 4027 case vect_pointer_var: 4028 prefix = "vectp"; 4029 break; 4030 default: 4031 gcc_unreachable (); 4032 } 4033 4034 if (name) 4035 { 4036 char* tmp = concat (prefix, "_", name, NULL); 4037 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4038 free (tmp); 4039 } 4040 else 4041 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4042 4043 return new_vect_var; 4044 } 4045 4046 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. 
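   E.g., for a vector type with 16-byte alignment and DR_MISALIGNMENT 4,
   NAME is recorded as 16-byte aligned with misalignment 4; a
   DR_MISALIGNMENT of -1 marks the alignment of NAME as unknown.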
*/ 4047 4048 static void 4049 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr, 4050 stmt_vec_info stmt_info) 4051 { 4052 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 4053 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info)); 4054 int misalign = DR_MISALIGNMENT (dr); 4055 if (misalign == -1) 4056 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4057 else 4058 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign); 4059 } 4060 4061 /* Function vect_create_addr_base_for_vector_ref. 4062 4063 Create an expression that computes the address of the first memory location 4064 that will be accessed for a data reference. 4065 4066 Input: 4067 STMT: The statement containing the data reference. 4068 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4069 OFFSET: Optional. If supplied, it is added to the initial address. 4070 LOOP: Specify the loop-nest relative to which the address should be computed. 4071 For example, when the dataref is in an inner-loop nested in an 4072 outer-loop that is now being vectorized, LOOP can be either the 4073 outer-loop, or the inner-loop. The first memory location accessed 4074 by the following dataref ('in' points to short): 4075 4076 for (i=0; i<N; i++) 4077 for (j=0; j<M; j++) 4078 s += in[i+j] 4079 4080 is as follows: 4081 if LOOP=i_loop: &in (relative to i_loop) 4082 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4083 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4084 initial address. Unlike OFFSET, which is the number of elements to 4085 be added, BYTE_OFFSET is measured in bytes. 4086 4087 Output: 4088 1. Return an SSA_NAME whose value is the address of the memory location of 4089 the first vector of the data reference. 4090 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4091 these statements, which define the returned SSA_NAME. 4092 4093 FORNOW: We are only handling array accesses with step 1.
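   Roughly speaking, in the loop case the address returned below is
   BASE_ADDRESS + DR_OFFSET + DR_INIT + OFFSET * sizeof (element) + BYTE_OFFSET,
   and the statements that compute it are appended to NEW_STMT_LIST.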
*/ 4094 4095 tree 4096 vect_create_addr_base_for_vector_ref (gimple *stmt, 4097 gimple_seq *new_stmt_list, 4098 tree offset, 4099 struct loop *loop, 4100 tree byte_offset) 4101 { 4102 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4103 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4104 tree data_ref_base; 4105 const char *base_name; 4106 tree addr_base; 4107 tree dest; 4108 gimple_seq seq = NULL; 4109 tree base_offset; 4110 tree init; 4111 tree vect_ptr_type; 4112 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4113 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4114 4115 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father) 4116 { 4117 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo); 4118 4119 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt)); 4120 4121 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 4122 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); 4123 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); 4124 } 4125 else 4126 { 4127 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); 4128 base_offset = unshare_expr (DR_OFFSET (dr)); 4129 init = unshare_expr (DR_INIT (dr)); 4130 } 4131 4132 if (loop_vinfo) 4133 base_name = get_name (data_ref_base); 4134 else 4135 { 4136 base_offset = ssize_int (0); 4137 init = ssize_int (0); 4138 base_name = get_name (DR_REF (dr)); 4139 } 4140 4141 /* Create base_offset */ 4142 base_offset = size_binop (PLUS_EXPR, 4143 fold_convert (sizetype, base_offset), 4144 fold_convert (sizetype, init)); 4145 4146 if (offset) 4147 { 4148 offset = fold_build2 (MULT_EXPR, sizetype, 4149 fold_convert (sizetype, offset), step); 4150 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4151 base_offset, offset); 4152 } 4153 if (byte_offset) 4154 { 4155 byte_offset = fold_convert (sizetype, byte_offset); 4156 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4157 base_offset, byte_offset); 4158 } 4159 4160 /* base + base_offset */ 4161 if (loop_vinfo) 4162 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4163 else 4164 { 4165 addr_base = build1 (ADDR_EXPR, 4166 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4167 unshare_expr (DR_REF (dr))); 4168 } 4169 4170 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4171 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4172 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4173 gimple_seq_add_seq (new_stmt_list, seq); 4174 4175 if (DR_PTR_INFO (dr) 4176 && TREE_CODE (addr_base) == SSA_NAME 4177 && !SSA_NAME_PTR_INFO (addr_base)) 4178 { 4179 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info); 4180 if (offset || byte_offset) 4181 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4182 } 4183 4184 if (dump_enabled_p ()) 4185 { 4186 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4187 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4188 dump_printf (MSG_NOTE, "\n"); 4189 } 4190 4191 return addr_base; 4192 } 4193 4194 4195 /* Function vect_create_data_ref_ptr. 4196 4197 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4198 location accessed in the loop by STMT, along with the def-use update 4199 chain to appropriately advance the pointer through the loop iterations. 4200 Also set aliasing information for the pointer. This pointer is used by 4201 the callers to this function to create a memory reference expression for 4202 vector load/store access. 4203 4204 Input: 4205 1. STMT: a stmt that references memory. 
Expected to be of the form 4206 GIMPLE_ASSIGN <name, data-ref> or 4207 GIMPLE_ASSIGN <data-ref, name>. 4208 2. AGGR_TYPE: the type of the reference, which should be either a vector 4209 or an array. 4210 3. AT_LOOP: the loop where the vector memref is to be created. 4211 4. OFFSET (optional): an offset to be added to the initial address accessed 4212 by the data-ref in STMT. 4213 5. BSI: location where the new stmts are to be placed if there is no loop 4214 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain 4215 pointing to the initial address. 4216 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added 4217 to the initial address accessed by the data-ref in STMT. This is 4218 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET 4219 in bytes. 4220 4221 Output: 4222 1. Declare a new ptr to vector_type, and have it point to the base of the 4223 data reference (initial addressed accessed by the data reference). 4224 For example, for vector of type V8HI, the following code is generated: 4225 4226 v8hi *ap; 4227 ap = (v8hi *)initial_address; 4228 4229 if OFFSET is not supplied: 4230 initial_address = &a[init]; 4231 if OFFSET is supplied: 4232 initial_address = &a[init + OFFSET]; 4233 if BYTE_OFFSET is supplied: 4234 initial_address = &a[init] + BYTE_OFFSET; 4235 4236 Return the initial_address in INITIAL_ADDRESS. 4237 4238 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also 4239 update the pointer in each iteration of the loop. 4240 4241 Return the increment stmt that updates the pointer in PTR_INCR. 4242 4243 3. Set INV_P to true if the access pattern of the data reference in the 4244 vectorized loop is invariant. Set it to false otherwise. 4245 4246 4. Return the pointer. */ 4247 4248 tree 4249 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, 4250 tree offset, tree *initial_address, 4251 gimple_stmt_iterator *gsi, gimple **ptr_incr, 4252 bool only_init, bool *inv_p, tree byte_offset) 4253 { 4254 const char *base_name; 4255 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4256 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4257 struct loop *loop = NULL; 4258 bool nested_in_vect_loop = false; 4259 struct loop *containing_loop = NULL; 4260 tree aggr_ptr_type; 4261 tree aggr_ptr; 4262 tree new_temp; 4263 gimple_seq new_stmt_list = NULL; 4264 edge pe = NULL; 4265 basic_block new_bb; 4266 tree aggr_ptr_init; 4267 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4268 tree aptr; 4269 gimple_stmt_iterator incr_gsi; 4270 bool insert_after; 4271 tree indx_before_incr, indx_after_incr; 4272 gimple *incr; 4273 tree step; 4274 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4275 4276 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE 4277 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4278 4279 if (loop_vinfo) 4280 { 4281 loop = LOOP_VINFO_LOOP (loop_vinfo); 4282 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4283 containing_loop = (gimple_bb (stmt))->loop_father; 4284 pe = loop_preheader_edge (loop); 4285 } 4286 else 4287 { 4288 gcc_assert (bb_vinfo); 4289 only_init = true; 4290 *ptr_incr = NULL; 4291 } 4292 4293 /* Check the step (evolution) of the load in LOOP, and record 4294 whether it's invariant. 
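      E.g., when vectorizing an outer i-loop, an inner-loop access like x[j]
      whose address does not change with i has a zero step relative to the
      outer loop and is therefore recorded as invariant here.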
*/ 4295 if (nested_in_vect_loop) 4296 step = STMT_VINFO_DR_STEP (stmt_info); 4297 else 4298 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); 4299 4300 if (integer_zerop (step)) 4301 *inv_p = true; 4302 else 4303 *inv_p = false; 4304 4305 /* Create an expression for the first address accessed by this load 4306 in LOOP. */ 4307 base_name = get_name (DR_BASE_ADDRESS (dr)); 4308 4309 if (dump_enabled_p ()) 4310 { 4311 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4312 dump_printf_loc (MSG_NOTE, vect_location, 4313 "create %s-pointer variable to type: ", 4314 get_tree_code_name (TREE_CODE (aggr_type))); 4315 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4316 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4317 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4318 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4319 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4320 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4321 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4322 else 4323 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4325 dump_printf (MSG_NOTE, "\n"); 4326 } 4327 4328 /* (1) Create the new aggregate-pointer variable. 4329 Vector and array types inherit the alias set of their component 4330 type by default so we need to use a ref-all pointer if the data 4331 reference does not conflict with the created aggregated data 4332 reference because it is not addressable. */ 4333 bool need_ref_all = false; 4334 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4335 get_alias_set (DR_REF (dr)))) 4336 need_ref_all = true; 4337 /* Likewise for any of the data references in the stmt group. */ 4338 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4339 { 4340 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4341 do 4342 { 4343 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4344 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4345 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4346 get_alias_set (DR_REF (sdr)))) 4347 { 4348 need_ref_all = true; 4349 break; 4350 } 4351 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4352 } 4353 while (orig_stmt); 4354 } 4355 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4356 need_ref_all); 4357 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4358 4359 4360 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4361 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4362 def-use update cycles for the pointer: one relative to the outer-loop 4363 (LOOP), which is what steps (3) and (4) below do. The other is relative 4364 to the inner-loop (which is the inner-most loop containing the dataref), 4365 and this is done be step (5) below. 4366 4367 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4368 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4369 redundant. Steps (3),(4) create the following: 4370 4371 vp0 = &base_addr; 4372 LOOP: vp1 = phi(vp0,vp2) 4373 ... 4374 ... 4375 vp2 = vp1 + step 4376 goto LOOP 4377 4378 If there is an inner-loop nested in loop, then step (5) will also be 4379 applied, and an additional update in the inner-loop will be created: 4380 4381 vp0 = &base_addr; 4382 LOOP: vp1 = phi(vp0,vp2) 4383 ... 4384 inner: vp3 = phi(vp1,vp4) 4385 vp4 = vp3 + inner_step 4386 if () goto inner 4387 ... 
4388 vp2 = vp1 + step 4389 if () goto LOOP */ 4390 4391 /* (2) Calculate the initial address of the aggregate-pointer, and set 4392 the aggregate-pointer to point to it before the loop. */ 4393 4394 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4395 4396 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4397 offset, loop, byte_offset); 4398 if (new_stmt_list) 4399 { 4400 if (pe) 4401 { 4402 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4403 gcc_assert (!new_bb); 4404 } 4405 else 4406 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4407 } 4408 4409 *initial_address = new_temp; 4410 aggr_ptr_init = new_temp; 4411 4412 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4413 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4414 inner-loop nested in LOOP (during outer-loop vectorization). */ 4415 4416 /* No update in loop is required. */ 4417 if (only_init && (!loop_vinfo || at_loop == loop)) 4418 aptr = aggr_ptr_init; 4419 else 4420 { 4421 /* The step of the aggregate pointer is the type size. */ 4422 tree iv_step = TYPE_SIZE_UNIT (aggr_type); 4423 /* One exception to the above is when the scalar step of the load in 4424 LOOP is zero. In this case the step here is also zero. */ 4425 if (*inv_p) 4426 iv_step = size_zero_node; 4427 else if (tree_int_cst_sgn (step) == -1) 4428 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4429 4430 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4431 4432 create_iv (aggr_ptr_init, 4433 fold_convert (aggr_ptr_type, iv_step), 4434 aggr_ptr, loop, &incr_gsi, insert_after, 4435 &indx_before_incr, &indx_after_incr); 4436 incr = gsi_stmt (incr_gsi); 4437 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4438 4439 /* Copy the points-to information if it exists. */ 4440 if (DR_PTR_INFO (dr)) 4441 { 4442 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4443 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4444 } 4445 if (ptr_incr) 4446 *ptr_incr = incr; 4447 4448 aptr = indx_before_incr; 4449 } 4450 4451 if (!nested_in_vect_loop || only_init) 4452 return aptr; 4453 4454 4455 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4456 nested in LOOP, if exists. */ 4457 4458 gcc_assert (nested_in_vect_loop); 4459 if (!only_init) 4460 { 4461 standard_iv_increment_position (containing_loop, &incr_gsi, 4462 &insert_after); 4463 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4464 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4465 &indx_after_incr); 4466 incr = gsi_stmt (incr_gsi); 4467 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4468 4469 /* Copy the points-to information if it exists. */ 4470 if (DR_PTR_INFO (dr)) 4471 { 4472 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4473 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4474 } 4475 if (ptr_incr) 4476 *ptr_incr = incr; 4477 4478 return indx_before_incr; 4479 } 4480 else 4481 gcc_unreachable (); 4482 } 4483 4484 4485 /* Function bump_vector_ptr 4486 4487 Increment a pointer (to a vector type) by vector-size. If requested, 4488 i.e. 
if PTR-INCR is given, then also connect the new increment stmt 4489 to the existing def-use update-chain of the pointer, by modifying 4490 the PTR_INCR as illustrated below: 4491 4492 The pointer def-use update-chain before this function: 4493 DATAREF_PTR = phi (p_0, p_2) 4494 .... 4495 PTR_INCR: p_2 = DATAREF_PTR + step 4496 4497 The pointer def-use update-chain after this function: 4498 DATAREF_PTR = phi (p_0, p_2) 4499 .... 4500 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4501 .... 4502 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4503 4504 Input: 4505 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4506 in the loop. 4507 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4508 the loop. The increment amount across iterations is expected 4509 to be vector_size. 4510 BSI - location where the new update stmt is to be placed. 4511 STMT - the original scalar memory-access stmt that is being vectorized. 4512 BUMP - optional. The offset by which to bump the pointer. If not given, 4513 the offset is assumed to be vector_size. 4514 4515 Output: Return NEW_DATAREF_PTR as illustrated above. 4516 4517 */ 4518 4519 tree 4520 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 4521 gimple *stmt, tree bump) 4522 { 4523 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4524 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4525 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4526 tree update = TYPE_SIZE_UNIT (vectype); 4527 gassign *incr_stmt; 4528 ssa_op_iter iter; 4529 use_operand_p use_p; 4530 tree new_dataref_ptr; 4531 4532 if (bump) 4533 update = bump; 4534 4535 if (TREE_CODE (dataref_ptr) == SSA_NAME) 4536 new_dataref_ptr = copy_ssa_name (dataref_ptr); 4537 else 4538 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 4539 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 4540 dataref_ptr, update); 4541 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 4542 4543 /* Copy the points-to information if it exists. */ 4544 if (DR_PTR_INFO (dr)) 4545 { 4546 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 4547 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 4548 } 4549 4550 if (!ptr_incr) 4551 return new_dataref_ptr; 4552 4553 /* Update the vector-pointer's cross-iteration increment. */ 4554 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 4555 { 4556 tree use = USE_FROM_PTR (use_p); 4557 4558 if (use == dataref_ptr) 4559 SET_USE (use_p, new_dataref_ptr); 4560 else 4561 gcc_assert (tree_int_cst_compare (use, update) == 0); 4562 } 4563 4564 return new_dataref_ptr; 4565 } 4566 4567 4568 /* Function vect_create_destination_var. 4569 4570 Create a new temporary of type VECTYPE. */ 4571 4572 tree 4573 vect_create_destination_var (tree scalar_dest, tree vectype) 4574 { 4575 tree vec_dest; 4576 const char *name; 4577 char *new_name; 4578 tree type; 4579 enum vect_var_kind kind; 4580 4581 kind = vectype 4582 ? VECTOR_BOOLEAN_TYPE_P (vectype) 4583 ? vect_mask_var 4584 : vect_simple_var 4585 : vect_scalar_var; 4586 type = vectype ? 
vectype : TREE_TYPE (scalar_dest); 4587 4588 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 4589 4590 name = get_name (scalar_dest); 4591 if (name) 4592 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 4593 else 4594 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 4595 vec_dest = vect_get_new_vect_var (type, kind, new_name); 4596 free (new_name); 4597 4598 return vec_dest; 4599 } 4600 4601 /* Function vect_grouped_store_supported. 4602 4603 Returns TRUE if interleave high and interleave low permutations 4604 are supported, and FALSE otherwise. */ 4605 4606 bool 4607 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 4608 { 4609 machine_mode mode = TYPE_MODE (vectype); 4610 4611 /* vect_permute_store_chain requires the group size to be equal to 3 or 4612 be a power of two. */ 4613 if (count != 3 && exact_log2 (count) == -1) 4614 { 4615 if (dump_enabled_p ()) 4616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4617 "the size of the group of accesses" 4618 " is not a power of 2 or not equal to 3\n"); 4619 return false; 4620 } 4621 4622 /* Check that the permutation is supported. */ 4623 if (VECTOR_MODE_P (mode)) 4624 { 4625 unsigned int i, nelt = GET_MODE_NUNITS (mode); 4626 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4627 4628 if (count == 3) 4629 { 4630 unsigned int j0 = 0, j1 = 0, j2 = 0; 4631 unsigned int i, j; 4632 4633 for (j = 0; j < 3; j++) 4634 { 4635 int nelt0 = ((3 - j) * nelt) % 3; 4636 int nelt1 = ((3 - j) * nelt + 1) % 3; 4637 int nelt2 = ((3 - j) * nelt + 2) % 3; 4638 for (i = 0; i < nelt; i++) 4639 { 4640 if (3 * i + nelt0 < nelt) 4641 sel[3 * i + nelt0] = j0++; 4642 if (3 * i + nelt1 < nelt) 4643 sel[3 * i + nelt1] = nelt + j1++; 4644 if (3 * i + nelt2 < nelt) 4645 sel[3 * i + nelt2] = 0; 4646 } 4647 if (!can_vec_perm_p (mode, false, sel)) 4648 { 4649 if (dump_enabled_p ()) 4650 dump_printf (MSG_MISSED_OPTIMIZATION, 4651 "permutation op not supported by target.\n"); 4652 return false; 4653 } 4654 4655 for (i = 0; i < nelt; i++) 4656 { 4657 if (3 * i + nelt0 < nelt) 4658 sel[3 * i + nelt0] = 3 * i + nelt0; 4659 if (3 * i + nelt1 < nelt) 4660 sel[3 * i + nelt1] = 3 * i + nelt1; 4661 if (3 * i + nelt2 < nelt) 4662 sel[3 * i + nelt2] = nelt + j2++; 4663 } 4664 if (!can_vec_perm_p (mode, false, sel)) 4665 { 4666 if (dump_enabled_p ()) 4667 dump_printf (MSG_MISSED_OPTIMIZATION, 4668 "permutation op not supported by target.\n"); 4669 return false; 4670 } 4671 } 4672 return true; 4673 } 4674 else 4675 { 4676 /* If length is not equal to 3 then only power of 2 is supported. */ 4677 gcc_assert (exact_log2 (count) != -1); 4678 4679 for (i = 0; i < nelt / 2; i++) 4680 { 4681 sel[i * 2] = i; 4682 sel[i * 2 + 1] = i + nelt; 4683 } 4684 if (can_vec_perm_p (mode, false, sel)) 4685 { 4686 for (i = 0; i < nelt; i++) 4687 sel[i] += nelt / 2; 4688 if (can_vec_perm_p (mode, false, sel)) 4689 return true; 4690 } 4691 } 4692 } 4693 4694 if (dump_enabled_p ()) 4695 dump_printf (MSG_MISSED_OPTIMIZATION, 4696 "permutation op not supported by target.\n"); 4697 return false; 4698 } 4699 4700 4701 /* Return TRUE if vec_store_lanes is available for COUNT vectors of 4702 type VECTYPE. */ 4703 4704 bool 4705 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 4706 { 4707 return vect_lanes_optab_supported_p ("vec_store_lanes", 4708 vec_store_lanes_optab, 4709 vectype, count); 4710 } 4711 4712 4713 /* Function vect_permute_store_chain.
4714 4715 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be 4716 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder 4717 the data correctly for the stores. Return the final references for stores 4718 in RESULT_CHAIN. 4719 4720 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 4721 The input is 4 vectors each containing 8 elements. We assign a number to 4722 each element, the input sequence is: 4723 4724 1st vec: 0 1 2 3 4 5 6 7 4725 2nd vec: 8 9 10 11 12 13 14 15 4726 3rd vec: 16 17 18 19 20 21 22 23 4727 4th vec: 24 25 26 27 28 29 30 31 4728 4729 The output sequence should be: 4730 4731 1st vec: 0 8 16 24 1 9 17 25 4732 2nd vec: 2 10 18 26 3 11 19 27 4733 3rd vec: 4 12 20 28 5 13 21 29 4734 4th vec: 6 14 22 30 7 15 23 31 4735 4736 i.e., we interleave the contents of the four vectors in their order. 4737 4738 We use interleave_high/low instructions to create such output. The input of 4739 each interleave_high/low operation is two vectors: 4740 1st vec 2nd vec 4741 0 1 2 3 4 5 6 7 4742 the even elements of the result vector are obtained left-to-right from the 4743 high/low elements of the first vector. The odd elements of the result are 4744 obtained left-to-right from the high/low elements of the second vector. 4745 The output of interleave_high will be: 0 4 1 5 4746 and of interleave_low: 2 6 3 7 4747 4748 4749 The permutation is done in log LENGTH stages. In each stage interleave_high 4750 and interleave_low stmts are created for each pair of vectors in DR_CHAIN, 4751 where the first argument is taken from the first half of DR_CHAIN and the 4752 second argument from its second half. 4753 In our example, 4754 4755 I1: interleave_high (1st vec, 3rd vec) 4756 I2: interleave_low (1st vec, 3rd vec) 4757 I3: interleave_high (2nd vec, 4th vec) 4758 I4: interleave_low (2nd vec, 4th vec) 4759 4760 The output for the first stage is: 4761 4762 I1: 0 16 1 17 2 18 3 19 4763 I2: 4 20 5 21 6 22 7 23 4764 I3: 8 24 9 25 10 26 11 27 4765 I4: 12 28 13 29 14 30 15 31 4766 4767 The output of the second stage, i.e. the final result is: 4768 4769 I1: 0 8 16 24 1 9 17 25 4770 I2: 2 10 18 26 3 11 19 27 4771 I3: 4 12 20 28 5 13 21 29 4772 I4: 6 14 22 30 7 15 23 31.
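   For example, with V8HI vectors (nelt == 8) each stage of the power-of-two
   case below uses the VEC_PERM_EXPR masks {0, 8, 1, 9, 2, 10, 3, 11} for
   interleave_high and {4, 12, 5, 13, 6, 14, 7, 15} for interleave_low.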
*/ 4773 4774 void 4775 vect_permute_store_chain (vec<tree> dr_chain, 4776 unsigned int length, 4777 gimple *stmt, 4778 gimple_stmt_iterator *gsi, 4779 vec<tree> *result_chain) 4780 { 4781 tree vect1, vect2, high, low; 4782 gimple *perm_stmt; 4783 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 4784 tree perm_mask_low, perm_mask_high; 4785 tree data_ref; 4786 tree perm3_mask_low, perm3_mask_high; 4787 unsigned int i, n, log_length = exact_log2 (length); 4788 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); 4789 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4790 4791 result_chain->quick_grow (length); 4792 memcpy (result_chain->address (), dr_chain.address (), 4793 length * sizeof (tree)); 4794 4795 if (length == 3) 4796 { 4797 unsigned int j0 = 0, j1 = 0, j2 = 0; 4798 4799 for (j = 0; j < 3; j++) 4800 { 4801 int nelt0 = ((3 - j) * nelt) % 3; 4802 int nelt1 = ((3 - j) * nelt + 1) % 3; 4803 int nelt2 = ((3 - j) * nelt + 2) % 3; 4804 4805 for (i = 0; i < nelt; i++) 4806 { 4807 if (3 * i + nelt0 < nelt) 4808 sel[3 * i + nelt0] = j0++; 4809 if (3 * i + nelt1 < nelt) 4810 sel[3 * i + nelt1] = nelt + j1++; 4811 if (3 * i + nelt2 < nelt) 4812 sel[3 * i + nelt2] = 0; 4813 } 4814 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 4815 4816 for (i = 0; i < nelt; i++) 4817 { 4818 if (3 * i + nelt0 < nelt) 4819 sel[3 * i + nelt0] = 3 * i + nelt0; 4820 if (3 * i + nelt1 < nelt) 4821 sel[3 * i + nelt1] = 3 * i + nelt1; 4822 if (3 * i + nelt2 < nelt) 4823 sel[3 * i + nelt2] = nelt + j2++; 4824 } 4825 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 4826 4827 vect1 = dr_chain[0]; 4828 vect2 = dr_chain[1]; 4829 4830 /* Create interleaving stmt: 4831 low = VEC_PERM_EXPR <vect1, vect2, 4832 {j, nelt, *, j + 1, nelt + j + 1, *, 4833 j + 2, nelt + j + 2, *, ...}> */ 4834 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 4835 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 4836 vect2, perm3_mask_low); 4837 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4838 4839 vect1 = data_ref; 4840 vect2 = dr_chain[2]; 4841 /* Create interleaving stmt: 4842 low = VEC_PERM_EXPR <vect1, vect2, 4843 {0, 1, nelt + j, 3, 4, nelt + j + 1, 4844 6, 7, nelt + j + 2, ...}> */ 4845 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 4846 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 4847 vect2, perm3_mask_high); 4848 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4849 (*result_chain)[j] = data_ref; 4850 } 4851 } 4852 else 4853 { 4854 /* If length is not equal to 3 then only power of 2 is supported. 
*/ 4855 gcc_assert (exact_log2 (length) != -1); 4856 4857 for (i = 0, n = nelt / 2; i < n; i++) 4858 { 4859 sel[i * 2] = i; 4860 sel[i * 2 + 1] = i + nelt; 4861 } 4862 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel); 4863 4864 for (i = 0; i < nelt; i++) 4865 sel[i] += nelt / 2; 4866 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel); 4867 4868 for (i = 0, n = log_length; i < n; i++) 4869 { 4870 for (j = 0; j < length/2; j++) 4871 { 4872 vect1 = dr_chain[j]; 4873 vect2 = dr_chain[j+length/2]; 4874 4875 /* Create interleaving stmt: 4876 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 4877 ...}> */ 4878 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 4879 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 4880 vect2, perm_mask_high); 4881 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4882 (*result_chain)[2*j] = high; 4883 4884 /* Create interleaving stmt: 4885 low = VEC_PERM_EXPR <vect1, vect2, 4886 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 4887 ...}> */ 4888 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 4889 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 4890 vect2, perm_mask_low); 4891 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4892 (*result_chain)[2*j+1] = low; 4893 } 4894 memcpy (dr_chain.address (), result_chain->address (), 4895 length * sizeof (tree)); 4896 } 4897 } 4898 } 4899 4900 /* Function vect_setup_realignment 4901 4902 This function is called when vectorizing an unaligned load using 4903 the dr_explicit_realign[_optimized] scheme. 4904 This function generates the following code at the loop prolog: 4905 4906 p = initial_addr; 4907 x msq_init = *(floor(p)); # prolog load 4908 realignment_token = call target_builtin; 4909 loop: 4910 x msq = phi (msq_init, ---) 4911 4912 The stmts marked with x are generated only for the case of 4913 dr_explicit_realign_optimized. 4914 4915 The code above sets up a new (vector) pointer, pointing to the first 4916 location accessed by STMT, and a "floor-aligned" load using that pointer. 4917 It also generates code to compute the "realignment-token" (if the relevant 4918 target hook was defined), and creates a phi-node at the loop-header bb 4919 whose arguments are the result of the prolog-load (created by this 4920 function) and the result of a load that takes place in the loop (to be 4921 created by the caller to this function). 4922 4923 For the case of dr_explicit_realign_optimized: 4924 The caller to this function uses the phi-result (msq) to create the 4925 realignment code inside the loop, and sets up the missing phi argument, 4926 as follows: 4927 loop: 4928 msq = phi (msq_init, lsq) 4929 lsq = *(floor(p')); # load in loop 4930 result = realign_load (msq, lsq, realignment_token); 4931 4932 For the case of dr_explicit_realign: 4933 loop: 4934 msq = *(floor(p)); # load in loop 4935 p' = p + (VS-1); 4936 lsq = *(floor(p')); # load in loop 4937 result = realign_load (msq, lsq, realignment_token); 4938 4939 Input: 4940 STMT - (scalar) load stmt to be vectorized. This load accesses 4941 a memory location that may be unaligned. 4942 BSI - place where new code is to be inserted. 4943 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 4944 is used. 4945 4946 Output: 4947 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 4948 target hook, if defined. 4949 Return value - the result of the loop-header phi node. 
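   For instance, on a target with 16-byte vectors floor(p) above stands for
   p & -16, so msq and lsq are the two aligned vectors surrounding the
   unaligned address, and REALIGN_LOAD combines them according to the
   realignment token.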
*/ 4950 4951 tree 4952 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, 4953 tree *realignment_token, 4954 enum dr_alignment_support alignment_support_scheme, 4955 tree init_addr, 4956 struct loop **at_loop) 4957 { 4958 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4959 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4960 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4961 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4962 struct loop *loop = NULL; 4963 edge pe = NULL; 4964 tree scalar_dest = gimple_assign_lhs (stmt); 4965 tree vec_dest; 4966 gimple *inc; 4967 tree ptr; 4968 tree data_ref; 4969 basic_block new_bb; 4970 tree msq_init = NULL_TREE; 4971 tree new_temp; 4972 gphi *phi_stmt; 4973 tree msq = NULL_TREE; 4974 gimple_seq stmts = NULL; 4975 bool inv_p; 4976 bool compute_in_loop = false; 4977 bool nested_in_vect_loop = false; 4978 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 4979 struct loop *loop_for_initial_load = NULL; 4980 4981 if (loop_vinfo) 4982 { 4983 loop = LOOP_VINFO_LOOP (loop_vinfo); 4984 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4985 } 4986 4987 gcc_assert (alignment_support_scheme == dr_explicit_realign 4988 || alignment_support_scheme == dr_explicit_realign_optimized); 4989 4990 /* We need to generate three things: 4991 1. the misalignment computation 4992 2. the extra vector load (for the optimized realignment scheme). 4993 3. the phi node for the two vectors from which the realignment is 4994 done (for the optimized realignment scheme). */ 4995 4996 /* 1. Determine where to generate the misalignment computation. 4997 4998 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 4999 calculation will be generated by this function, outside the loop (in the 5000 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5001 caller, inside the loop. 5002 5003 Background: If the misalignment remains fixed throughout the iterations of 5004 the loop, then both realignment schemes are applicable, and also the 5005 misalignment computation can be done outside LOOP. This is because we are 5006 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5007 are a multiple of VS (the Vector Size), and therefore the misalignment in 5008 different vectorized LOOP iterations is always the same. 5009 The problem arises only if the memory access is in an inner-loop nested 5010 inside LOOP, which is now being vectorized using outer-loop vectorization. 5011 This is the only case when the misalignment of the memory access may not 5012 remain fixed throughout the iterations of the inner-loop (as explained in 5013 detail in vect_supportable_dr_alignment). In this case, not only is the 5014 optimized realignment scheme not applicable, but also the misalignment 5015 computation (and generation of the realignment token that is passed to 5016 REALIGN_LOAD) have to be done inside the loop. 5017 5018 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5019 or not, which in turn determines if the misalignment is computed inside 5020 the inner-loop, or outside LOOP. */ 5021 5022 if (init_addr != NULL_TREE || !loop_vinfo) 5023 { 5024 compute_in_loop = true; 5025 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5026 } 5027 5028 5029 /* 2. Determine where to generate the extra vector load. 
5030 5031 For the optimized realignment scheme, instead of generating two vector 5032 loads in each iteration, we generate a single extra vector load in the 5033 preheader of the loop, and in each iteration reuse the result of the 5034 vector load from the previous iteration. In case the memory access is in 5035 an inner-loop nested inside LOOP, which is now being vectorized using 5036 outer-loop vectorization, we need to determine whether this initial vector 5037 load should be generated at the preheader of the inner-loop, or can be 5038 generated at the preheader of LOOP. If the memory access has no evolution 5039 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5040 to be generated inside LOOP (in the preheader of the inner-loop). */ 5041 5042 if (nested_in_vect_loop) 5043 { 5044 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5045 bool invariant_in_outerloop = 5046 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5047 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5048 } 5049 else 5050 loop_for_initial_load = loop; 5051 if (at_loop) 5052 *at_loop = loop_for_initial_load; 5053 5054 if (loop_for_initial_load) 5055 pe = loop_preheader_edge (loop_for_initial_load); 5056 5057 /* 3. For the case of the optimized realignment, create the first vector 5058 load at the loop preheader. */ 5059 5060 if (alignment_support_scheme == dr_explicit_realign_optimized) 5061 { 5062 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5063 gassign *new_stmt; 5064 5065 gcc_assert (!compute_in_loop); 5066 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5067 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 5068 NULL_TREE, &init_addr, NULL, &inc, 5069 true, &inv_p); 5070 if (TREE_CODE (ptr) == SSA_NAME) 5071 new_temp = copy_ssa_name (ptr); 5072 else 5073 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5074 new_stmt = gimple_build_assign 5075 (new_temp, BIT_AND_EXPR, ptr, 5076 build_int_cst (TREE_TYPE (ptr), 5077 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype))); 5078 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5079 gcc_assert (!new_bb); 5080 data_ref 5081 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5082 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5083 new_stmt = gimple_build_assign (vec_dest, data_ref); 5084 new_temp = make_ssa_name (vec_dest, new_stmt); 5085 gimple_assign_set_lhs (new_stmt, new_temp); 5086 if (pe) 5087 { 5088 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5089 gcc_assert (!new_bb); 5090 } 5091 else 5092 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5093 5094 msq_init = gimple_assign_lhs (new_stmt); 5095 } 5096 5097 /* 4. Create realignment token using a target builtin, if available. 5098 It is done either inside the containing loop, or before LOOP (as 5099 determined above). */ 5100 5101 if (targetm.vectorize.builtin_mask_for_load) 5102 { 5103 gcall *new_stmt; 5104 tree builtin_decl; 5105 5106 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5107 if (!init_addr) 5108 { 5109 /* Generate the INIT_ADDR computation outside LOOP. 
*/ 5110 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 5111 NULL_TREE, loop); 5112 if (loop) 5113 { 5114 pe = loop_preheader_edge (loop); 5115 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5116 gcc_assert (!new_bb); 5117 } 5118 else 5119 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5120 } 5121 5122 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5123 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5124 vec_dest = 5125 vect_create_destination_var (scalar_dest, 5126 gimple_call_return_type (new_stmt)); 5127 new_temp = make_ssa_name (vec_dest, new_stmt); 5128 gimple_call_set_lhs (new_stmt, new_temp); 5129 5130 if (compute_in_loop) 5131 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5132 else 5133 { 5134 /* Generate the misalignment computation outside LOOP. */ 5135 pe = loop_preheader_edge (loop); 5136 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5137 gcc_assert (!new_bb); 5138 } 5139 5140 *realignment_token = gimple_call_lhs (new_stmt); 5141 5142 /* The result of the CALL_EXPR to this builtin is determined from 5143 the value of the parameter and no global variables are touched 5144 which makes the builtin a "const" function. Requiring the 5145 builtin to have the "const" attribute makes it unnecessary 5146 to call mark_call_clobbered. */ 5147 gcc_assert (TREE_READONLY (builtin_decl)); 5148 } 5149 5150 if (alignment_support_scheme == dr_explicit_realign) 5151 return msq; 5152 5153 gcc_assert (!compute_in_loop); 5154 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5155 5156 5157 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5158 5159 pe = loop_preheader_edge (containing_loop); 5160 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5161 msq = make_ssa_name (vec_dest); 5162 phi_stmt = create_phi_node (msq, containing_loop->header); 5163 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5164 5165 return msq; 5166 } 5167 5168 5169 /* Function vect_grouped_load_supported. 5170 5171 Returns TRUE if even and odd permutations are supported, 5172 and FALSE otherwise. */ 5173 5174 bool 5175 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) 5176 { 5177 machine_mode mode = TYPE_MODE (vectype); 5178 5179 /* vect_permute_load_chain requires the group size to be equal to 3 or 5180 be a power of two. */ 5181 if (count != 3 && exact_log2 (count) == -1) 5182 { 5183 if (dump_enabled_p ()) 5184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5185 "the size of the group of accesses" 5186 " is not a power of 2 or not equal to 3\n"); 5187 return false; 5188 } 5189 5190 /* Check that the permutation is supported. 
*/ 5191 if (VECTOR_MODE_P (mode)) 5192 { 5193 unsigned int i, j, nelt = GET_MODE_NUNITS (mode); 5194 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5195 5196 if (count == 3) 5197 { 5198 unsigned int k; 5199 for (k = 0; k < 3; k++) 5200 { 5201 for (i = 0; i < nelt; i++) 5202 if (3 * i + k < 2 * nelt) 5203 sel[i] = 3 * i + k; 5204 else 5205 sel[i] = 0; 5206 if (!can_vec_perm_p (mode, false, sel)) 5207 { 5208 if (dump_enabled_p ()) 5209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5210 "shuffle of 3 loads is not supported by" 5211 " target\n"); 5212 return false; 5213 } 5214 for (i = 0, j = 0; i < nelt; i++) 5215 if (3 * i + k < 2 * nelt) 5216 sel[i] = i; 5217 else 5218 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5219 if (!can_vec_perm_p (mode, false, sel)) 5220 { 5221 if (dump_enabled_p ()) 5222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5223 "shuffle of 3 loads is not supported by" 5224 " target\n"); 5225 return false; 5226 } 5227 } 5228 return true; 5229 } 5230 else 5231 { 5232 /* If length is not equal to 3 then only power of 2 is supported. */ 5233 gcc_assert (exact_log2 (count) != -1); 5234 for (i = 0; i < nelt; i++) 5235 sel[i] = i * 2; 5236 if (can_vec_perm_p (mode, false, sel)) 5237 { 5238 for (i = 0; i < nelt; i++) 5239 sel[i] = i * 2 + 1; 5240 if (can_vec_perm_p (mode, false, sel)) 5241 return true; 5242 } 5243 } 5244 } 5245 5246 if (dump_enabled_p ()) 5247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5248 "extract even/odd not supported by target\n"); 5249 return false; 5250 } 5251 5252 /* Return TRUE if vec_load_lanes is available for COUNT vectors of 5253 type VECTYPE. */ 5254 5255 bool 5256 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 5257 { 5258 return vect_lanes_optab_supported_p ("vec_load_lanes", 5259 vec_load_lanes_optab, 5260 vectype, count); 5261 } 5262 5263 /* Function vect_permute_load_chain. 5264 5265 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5266 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5267 the input data correctly. Return the final references for loads in 5268 RESULT_CHAIN. 5269 5270 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5271 The input is 4 vectors each containing 8 elements. We assign a number to each 5272 element, the input sequence is: 5273 5274 1st vec: 0 1 2 3 4 5 6 7 5275 2nd vec: 8 9 10 11 12 13 14 15 5276 3rd vec: 16 17 18 19 20 21 22 23 5277 4th vec: 24 25 26 27 28 29 30 31 5278 5279 The output sequence should be: 5280 5281 1st vec: 0 4 8 12 16 20 24 28 5282 2nd vec: 1 5 9 13 17 21 25 29 5283 3rd vec: 2 6 10 14 18 22 26 30 5284 4th vec: 3 7 11 15 19 23 27 31 5285 5286 i.e., the first output vector should contain the first elements of each 5287 interleaving group, etc. 5288 5289 We use extract_even/odd instructions to create such output. The input of 5290 each extract_even/odd operation is two vectors 5291 1st vec 2nd vec 5292 0 1 2 3 4 5 6 7 5293 5294 and the output is the vector of extracted even/odd elements. The output of 5295 extract_even will be: 0 2 4 6 5296 and of extract_odd: 1 3 5 7 5297 5298 5299 The permutation is done in log LENGTH stages. In each stage extract_even 5300 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5301 their order. 
In our example, 5302 5303 E1: extract_even (1st vec, 2nd vec) 5304 E2: extract_odd (1st vec, 2nd vec) 5305 E3: extract_even (3rd vec, 4th vec) 5306 E4: extract_odd (3rd vec, 4th vec) 5307 5308 The output for the first stage will be: 5309 5310 E1: 0 2 4 6 8 10 12 14 5311 E2: 1 3 5 7 9 11 13 15 5312 E3: 16 18 20 22 24 26 28 30 5313 E4: 17 19 21 23 25 27 29 31 5314 5315 In order to proceed and create the correct sequence for the next stage (or 5316 for the correct output, if the second stage is the last one, as in our 5317 example), we first put the output of extract_even operation and then the 5318 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5319 The input for the second stage is: 5320 5321 1st vec (E1): 0 2 4 6 8 10 12 14 5322 2nd vec (E3): 16 18 20 22 24 26 28 30 5323 3rd vec (E2): 1 3 5 7 9 11 13 15 5324 4th vec (E4): 17 19 21 23 25 27 29 31 5325 5326 The output of the second stage: 5327 5328 E1: 0 4 8 12 16 20 24 28 5329 E2: 2 6 10 14 18 22 26 30 5330 E3: 1 5 9 13 17 21 25 29 5331 E4: 3 7 11 15 19 23 27 31 5332 5333 And RESULT_CHAIN after reordering: 5334 5335 1st vec (E1): 0 4 8 12 16 20 24 28 5336 2nd vec (E3): 1 5 9 13 17 21 25 29 5337 3rd vec (E2): 2 6 10 14 18 22 26 30 5338 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5339 5340 static void 5341 vect_permute_load_chain (vec<tree> dr_chain, 5342 unsigned int length, 5343 gimple *stmt, 5344 gimple_stmt_iterator *gsi, 5345 vec<tree> *result_chain) 5346 { 5347 tree data_ref, first_vect, second_vect; 5348 tree perm_mask_even, perm_mask_odd; 5349 tree perm3_mask_low, perm3_mask_high; 5350 gimple *perm_stmt; 5351 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5352 unsigned int i, j, log_length = exact_log2 (length); 5353 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5354 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5355 5356 result_chain->quick_grow (length); 5357 memcpy (result_chain->address (), dr_chain.address (), 5358 length * sizeof (tree)); 5359 5360 if (length == 3) 5361 { 5362 unsigned int k; 5363 5364 for (k = 0; k < 3; k++) 5365 { 5366 for (i = 0; i < nelt; i++) 5367 if (3 * i + k < 2 * nelt) 5368 sel[i] = 3 * i + k; 5369 else 5370 sel[i] = 0; 5371 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 5372 5373 for (i = 0, j = 0; i < nelt; i++) 5374 if (3 * i + k < 2 * nelt) 5375 sel[i] = i; 5376 else 5377 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5378 5379 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 5380 5381 first_vect = dr_chain[0]; 5382 second_vect = dr_chain[1]; 5383 5384 /* Create interleaving stmt (low part of): 5385 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5386 ...}> */ 5387 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5388 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5389 second_vect, perm3_mask_low); 5390 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5391 5392 /* Create interleaving stmt (high part of): 5393 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5394 ...}> */ 5395 first_vect = data_ref; 5396 second_vect = dr_chain[2]; 5397 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5398 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5399 second_vect, perm3_mask_high); 5400 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5401 (*result_chain)[k] = data_ref; 5402 } 5403 } 5404 else 5405 { 5406 /* If length is not equal to 3 then only power of 2 is supported. 
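	 As an illustration, for eight elements per vector (nelt == 8) the
	 even mask built below is {0, 2, 4, 6, 8, 10, 12, 14} and the odd
	 mask is {1, 3, 5, 7, 9, 11, 13, 15}; applying them for
	 log2 (LENGTH) passes over DR_CHAIN produces the reordering shown
	 in the function comment above.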
	 */
      gcc_assert (exact_log2 (length) != -1);

      for (i = 0; i < nelt; ++i)
	sel[i] = i * 2;
      perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < nelt; ++i)
	sel[i] = i * 2 + 1;
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}

/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
   First we shuffle all 3 vectors to get the correct element order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift the vectors 3 times:

   1st step:
     shift right by 6 the concatenation of:
     "1st vec" and "2nd vec"
       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
     "2nd vec" and "3rd vec"
       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
     "3rd vec" and "1st vec"
       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
			     | New vectors                   |

     So that now new vectors are:

     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
     2nd vec:  (10 13) (16 19 22) (17 20 23)
     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   2nd step:
     shift right by 5 the concatenation of:
     "1st vec" and "3rd vec"
       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
     "2nd vec" and "1st vec"
       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
     "3rd vec" and "2nd vec"
       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
			  | New vectors                   |

     So that now new vectors are:

     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22)   READY

   3rd step:
     shift right by 5 the concatenation of:
     "1st vec" and "1st vec"
       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
     shift right by 3 the concatenation of:
     "2nd vec" and "2nd vec"
		(17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
			   | New vectors                   |

     So that now all vectors are READY:

     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:
     1.  "Shift of a concatenation" is faster than a general permutation.
	 This is usually so.
     2.  The TARGET machine can't execute vector instructions in parallel.
	 This is because each step of the algorithm depends on the previous
	 one.  The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
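
   (For LENGTH == 3, the 3 * 3 - 1 == 8 shifts mentioned above correspond to
   the three shift1 permutes, the three shift2 permutes and the final
   shift3 and shift4 permutes emitted in the code below.)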
5531 */ 5532 5533 static bool 5534 vect_shift_permute_load_chain (vec<tree> dr_chain, 5535 unsigned int length, 5536 gimple *stmt, 5537 gimple_stmt_iterator *gsi, 5538 vec<tree> *result_chain) 5539 { 5540 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 5541 tree perm2_mask1, perm2_mask2, perm3_mask; 5542 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 5543 gimple *perm_stmt; 5544 5545 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5546 unsigned int i; 5547 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5548 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5549 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5550 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5551 5552 result_chain->quick_grow (length); 5553 memcpy (result_chain->address (), dr_chain.address (), 5554 length * sizeof (tree)); 5555 5556 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) 5557 { 5558 unsigned int j, log_length = exact_log2 (length); 5559 for (i = 0; i < nelt / 2; ++i) 5560 sel[i] = i * 2; 5561 for (i = 0; i < nelt / 2; ++i) 5562 sel[nelt / 2 + i] = i * 2 + 1; 5563 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5564 { 5565 if (dump_enabled_p ()) 5566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5567 "shuffle of 2 fields structure is not \ 5568 supported by target\n"); 5569 return false; 5570 } 5571 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel); 5572 5573 for (i = 0; i < nelt / 2; ++i) 5574 sel[i] = i * 2 + 1; 5575 for (i = 0; i < nelt / 2; ++i) 5576 sel[nelt / 2 + i] = i * 2; 5577 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5578 { 5579 if (dump_enabled_p ()) 5580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5581 "shuffle of 2 fields structure is not \ 5582 supported by target\n"); 5583 return false; 5584 } 5585 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel); 5586 5587 /* Generating permutation constant to shift all elements. 5588 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 5589 for (i = 0; i < nelt; i++) 5590 sel[i] = nelt / 2 + i; 5591 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5592 { 5593 if (dump_enabled_p ()) 5594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5595 "shift permutation is not supported by target\n"); 5596 return false; 5597 } 5598 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5599 5600 /* Generating permutation constant to select vector from 2. 5601 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
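	 That is, the result combines the low half of the first
	 VEC_PERM_EXPR operand with the high half of the second operand.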
*/ 5602 for (i = 0; i < nelt / 2; i++) 5603 sel[i] = i; 5604 for (i = nelt / 2; i < nelt; i++) 5605 sel[i] = nelt + i; 5606 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5607 { 5608 if (dump_enabled_p ()) 5609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5610 "select is not supported by target\n"); 5611 return false; 5612 } 5613 select_mask = vect_gen_perm_mask_checked (vectype, sel); 5614 5615 for (i = 0; i < log_length; i++) 5616 { 5617 for (j = 0; j < length; j += 2) 5618 { 5619 first_vect = dr_chain[j]; 5620 second_vect = dr_chain[j + 1]; 5621 5622 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5623 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5624 first_vect, first_vect, 5625 perm2_mask1); 5626 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5627 vect[0] = data_ref; 5628 5629 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5630 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5631 second_vect, second_vect, 5632 perm2_mask2); 5633 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5634 vect[1] = data_ref; 5635 5636 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); 5637 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5638 vect[0], vect[1], shift1_mask); 5639 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5640 (*result_chain)[j/2 + length/2] = data_ref; 5641 5642 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); 5643 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5644 vect[0], vect[1], select_mask); 5645 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5646 (*result_chain)[j/2] = data_ref; 5647 } 5648 memcpy (dr_chain.address (), result_chain->address (), 5649 length * sizeof (tree)); 5650 } 5651 return true; 5652 } 5653 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) 5654 { 5655 unsigned int k = 0, l = 0; 5656 5657 /* Generating permutation constant to get all elements in rigth order. 5658 For vector length 8 it is {0 3 6 1 4 7 2 5}. */ 5659 for (i = 0; i < nelt; i++) 5660 { 5661 if (3 * k + (l % 3) >= nelt) 5662 { 5663 k = 0; 5664 l += (3 - (nelt % 3)); 5665 } 5666 sel[i] = 3 * k + (l % 3); 5667 k++; 5668 } 5669 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5670 { 5671 if (dump_enabled_p ()) 5672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5673 "shuffle of 3 fields structure is not \ 5674 supported by target\n"); 5675 return false; 5676 } 5677 perm3_mask = vect_gen_perm_mask_checked (vectype, sel); 5678 5679 /* Generating permutation constant to shift all elements. 5680 For vector length 8 it is {6 7 8 9 10 11 12 13}. */ 5681 for (i = 0; i < nelt; i++) 5682 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; 5683 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5684 { 5685 if (dump_enabled_p ()) 5686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5687 "shift permutation is not supported by target\n"); 5688 return false; 5689 } 5690 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5691 5692 /* Generating permutation constant to shift all elements. 5693 For vector length 8 it is {5 6 7 8 9 10 11 12}. 
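	 That is, the concatenation of the two VEC_PERM_EXPR operands is
	 shifted right by 2 * (nelt / 3) + 1 elements.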
*/ 5694 for (i = 0; i < nelt; i++) 5695 sel[i] = 2 * (nelt / 3) + 1 + i; 5696 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5697 { 5698 if (dump_enabled_p ()) 5699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5700 "shift permutation is not supported by target\n"); 5701 return false; 5702 } 5703 shift2_mask = vect_gen_perm_mask_checked (vectype, sel); 5704 5705 /* Generating permutation constant to shift all elements. 5706 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 5707 for (i = 0; i < nelt; i++) 5708 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 5709 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5710 { 5711 if (dump_enabled_p ()) 5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5713 "shift permutation is not supported by target\n"); 5714 return false; 5715 } 5716 shift3_mask = vect_gen_perm_mask_checked (vectype, sel); 5717 5718 /* Generating permutation constant to shift all elements. 5719 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 5720 for (i = 0; i < nelt; i++) 5721 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 5722 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5723 { 5724 if (dump_enabled_p ()) 5725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5726 "shift permutation is not supported by target\n"); 5727 return false; 5728 } 5729 shift4_mask = vect_gen_perm_mask_checked (vectype, sel); 5730 5731 for (k = 0; k < 3; k++) 5732 { 5733 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 5734 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5735 dr_chain[k], dr_chain[k], 5736 perm3_mask); 5737 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5738 vect[k] = data_ref; 5739 } 5740 5741 for (k = 0; k < 3; k++) 5742 { 5743 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 5744 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5745 vect[k % 3], vect[(k + 1) % 3], 5746 shift1_mask); 5747 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5748 vect_shift[k] = data_ref; 5749 } 5750 5751 for (k = 0; k < 3; k++) 5752 { 5753 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 5754 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5755 vect_shift[(4 - k) % 3], 5756 vect_shift[(3 - k) % 3], 5757 shift2_mask); 5758 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5759 vect[k] = data_ref; 5760 } 5761 5762 (*result_chain)[3 - (nelt % 3)] = vect[2]; 5763 5764 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 5765 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 5766 vect[0], shift3_mask); 5767 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5768 (*result_chain)[nelt % 3] = data_ref; 5769 5770 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 5771 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 5772 vect[1], shift4_mask); 5773 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5774 (*result_chain)[0] = data_ref; 5775 return true; 5776 } 5777 return false; 5778 } 5779 5780 /* Function vect_transform_grouped_load. 5781 5782 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 5783 to perform their permutation and ascribe the result vectorized statements to 5784 the scalar statements. 5785 */ 5786 5787 void 5788 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size, 5789 gimple_stmt_iterator *gsi) 5790 { 5791 machine_mode mode; 5792 vec<tree> result_chain = vNULL; 5793 5794 /* DR_CHAIN contains input data-refs that are a part of the interleaving. 
   RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
   permuted vectors that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get a chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || exact_log2 (size) != -1
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
					 gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}

/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by the
	 dead code elimination pass later.  No need to check for the first
	 stmt in the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT.  */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  gimple *prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple *rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}

/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.
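   For example, the alignment of an automatic variable can usually be raised
   up to MAX_STACK_ALIGNMENT bits, while a static variable is limited by
   MAX_OFILE_ALIGNMENT and by whether the symbol table allows its alignment
   to be increased, as checked below.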
   */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}


/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with a
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.
For example: 5966 5967 for (i=0; i<N; i++) 5968 for (j=0; j<M; j++) 5969 s += a[i+j]; 5970 5971 When vectorizing the i-loop in the above example, the step between 5972 consecutive vector loads is 1, and so the misalignment does not remain 5973 fixed across the execution of the inner-loop, and the realignment cannot 5974 be optimized (as illustrated in the following pseudo vectorized loop): 5975 5976 for (i=0; i<N; i+=4) 5977 for (j=0; j<M; j++){ 5978 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 5979 // when j is {0,1,2,3,4,5,6,7,...} respectively. 5980 // (assuming that we start from an aligned address). 5981 } 5982 5983 We therefore have to use the unoptimized realignment scheme: 5984 5985 for (i=0; i<N; i+=4) 5986 for (j=k; j<M; j+=4) 5987 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 5988 // that the misalignment of the initial address is 5989 // 0). 5990 5991 The loop can then be vectorized as follows: 5992 5993 for (k=0; k<4; k++){ 5994 rt = get_realignment_token (&vp[k]); 5995 for (i=0; i<N; i+=4){ 5996 v1 = vp[i+k]; 5997 for (j=k; j<M; j+=4){ 5998 v2 = vp[i+j+VS-1]; 5999 va = REALIGN_LOAD <v1,v2,rt>; 6000 vs += va; 6001 v1 = v2; 6002 } 6003 } 6004 } */ 6005 6006 if (DR_IS_READ (dr)) 6007 { 6008 bool is_packed = false; 6009 tree type = (TREE_TYPE (DR_REF (dr))); 6010 6011 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 6012 && (!targetm.vectorize.builtin_mask_for_load 6013 || targetm.vectorize.builtin_mask_for_load ())) 6014 { 6015 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6016 6017 /* If we are doing SLP then the accesses need not have the 6018 same alignment, instead it depends on the SLP group size. */ 6019 if (loop_vinfo 6020 && STMT_SLP_TYPE (stmt_info) 6021 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6022 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))) 6023 % TYPE_VECTOR_SUBPARTS (vectype) != 0)) 6024 ; 6025 else if (!loop_vinfo 6026 || (nested_in_vect_loop 6027 && (TREE_INT_CST_LOW (DR_STEP (dr)) 6028 != GET_MODE_SIZE (TYPE_MODE (vectype))))) 6029 return dr_explicit_realign; 6030 else 6031 return dr_explicit_realign_optimized; 6032 } 6033 if (!known_alignment_for_access_p (dr)) 6034 is_packed = not_size_aligned (DR_REF (dr)); 6035 6036 if (targetm.vectorize.support_vector_misalignment 6037 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6038 /* Can't software pipeline the loads, but can at least do them. */ 6039 return dr_unaligned_supported; 6040 } 6041 else 6042 { 6043 bool is_packed = false; 6044 tree type = (TREE_TYPE (DR_REF (dr))); 6045 6046 if (!known_alignment_for_access_p (dr)) 6047 is_packed = not_size_aligned (DR_REF (dr)); 6048 6049 if (targetm.vectorize.support_vector_misalignment 6050 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6051 return dr_unaligned_supported; 6052 } 6053 6054 /* Unsupported. */ 6055 return dr_unaligned_unsupported; 6056 } 6057
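
/* A rough summary of the choices made above (a reader's sketch, not a
   separate API): dr_aligned when the access is known to be aligned and the
   caller did not ask about a different alignment; dr_unaligned_supported
   for masked loads/stores and for accesses that the target hook
   support_vector_misalignment accepts; dr_explicit_realign or
   dr_explicit_realign_optimized for reads on targets that provide
   vec_realign_load, depending on whether the realignment token can be
   computed once in the loop preheader; and dr_unaligned_unsupported
   otherwise.  */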