/* Loop autoparallelization.
   Copyright (C) 2006-2017 Free Software Foundation, Inc.
   Contributed by Sebastian Pop <pop@cri.ensmp.fr>
   Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "cgraph.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-walk.h"
#include "stor-layout.h"
#include "tree-nested.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "langhooks.h"
#include "tree-vectorizer.h"
#include "tree-hasher.h"
#include "tree-parloops.h"
#include "omp-general.h"
#include "omp-low.h"
#include "tree-ssa.h"
#include "params.h"
#include "params-enum.h"
#include "tree-ssa-alias.h"
#include "tree-eh.h"
#include "gomp-constants.h"
#include "tree-dfa.h"

/* This pass tries to distribute iterations of loops into several threads.
   The implementation is straightforward -- for each loop we test whether its
   iterations are independent, and if that is the case (and some additional
   conditions regarding profitability and correctness are satisfied), we
   add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
   machinery do its job.

   Most of the complexity is in bringing the code into shape expected
   by the omp expanders:
   -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
      variable and that the exit test is at the start of the loop body
   -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
      variables by accesses through pointers, and breaking up ssa chains
      by storing the values incoming to the parallelized loop to a structure
      passed to the new function as an argument (something similar is done
      in omp gimplification, unfortunately only a small part of the code
      can be shared).

   TODO:
   -- if there are several parallelizable loops in a function, it may be
      possible to generate the threads just once (using synchronization to
      ensure that cross-loop dependences are obeyed).
   -- handling of common reduction patterns for outer loops.

   More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
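/* For illustration (not part of the pass itself): under
   -ftree-parallelize-loops=4, a loop such as

     for (i = 0; i < N; i++)
       a[i] = b[i] + c[i];

   is, roughly speaking, handled as if the user had requested

     #pragma omp parallel for num_threads (4) schedule (static)
     for (i = 0; i < N; i++)
       a[i] = b[i] + c[i];

   except that everything happens on GIMPLE, after the dependence,
   profitability and loop-shape checks described above.  */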
/*
  Reduction handling:
  currently we use vect_force_simple_reduction() to detect reduction patterns.
  The code transformation will be introduced by an example.


parloop
{
  int sum=1;

  for (i = 0; i < N; i++)
   {
    x[i] = i + 3;
    sum+=x[i];
   }
}

gimple-like code:
header_bb:

  # sum_29 = PHI <sum_11(5), 1(3)>
  # i_28 = PHI <i_12(5), 0(3)>
  D.1795_8 = i_28 + 3;
  x[i_28] = D.1795_8;
  sum_11 = D.1795_8 + sum_29;
  i_12 = i_28 + 1;
  if (N_6(D) > i_12)
    goto header_bb;


exit_bb:

  # sum_21 = PHI <sum_11(4)>
  printf (&"%d"[0], sum_21);


after reduction transformation (only relevant parts):

parloop
{

....


  # Storing the initial value given by the user.  #

  .paral_data_store.32.sum.27 = 1;

  #pragma omp parallel num_threads(4)

  #pragma omp for schedule(static)

  # The neutral element corresponding to the particular
  reduction's operation, e.g. 0 for PLUS_EXPR,
  1 for MULT_EXPR, etc. replaces the user's initial value.  #

  # sum.27_29 = PHI <sum.27_11, 0>

  sum.27_11 = D.1827_8 + sum.27_29;

  GIMPLE_OMP_CONTINUE

  # Adding this reduction phi is done at create_phi_for_local_result()  #
  # sum.27_56 = PHI <sum.27_11, 0>
  GIMPLE_OMP_RETURN

  # Creating the atomic operation is done at
  create_call_for_reduction_1()  #

  #pragma omp atomic_load
  D.1839_59 = *&.paral_data_load.33_51->reduction.23;
  D.1840_60 = sum.27_56 + D.1839_59;
  #pragma omp atomic_store (D.1840_60);

  GIMPLE_OMP_RETURN

  # collecting the result after the join of the threads is done at
  create_loads_for_reductions().
  The value computed by the threads is loaded from the
  shared struct.  #


  .paral_data_load.33_52 = &.paral_data_store.32;
  sum_37 = .paral_data_load.33_52->sum.27;
  sum_43 = D.1795_41 + sum_37;

  exit bb:
  # sum_21 = PHI <sum_43, sum_26>
  printf (&"%d"[0], sum_21);

...

}

*/

/* Minimal number of iterations of a loop that should be executed in each
   thread.  */
#define MIN_PER_THREAD 100

/* Element of the hashtable, representing a
   reduction in the current loop.  */
struct reduction_info
{
  gimple *reduc_stmt;		/* reduction statement.  */
  gimple *reduc_phi;		/* The phi node defining the reduction.  */
  enum tree_code reduction_code;/* code for the reduction operation.  */
  unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
				   result.  */
  gphi *keep_res;		/* The PHI_RESULT of this phi is the resulting
				   value of the reduction variable when exiting
				   the loop.  */
  tree initial_value;		/* The initial value of the reduction var before
				   entering the loop.  */
  tree field;			/* the name of the field in the parloop data
				   structure intended for reduction.  */
  tree reduc_addr;		/* The address of the reduction variable for
				   openacc reductions.  */
  tree init;			/* reduction initialization value.  */
  gphi *new_phi;		/* (helper field) Newly created phi node whose
				   result will be passed to the atomic
				   operation.  Represents the local result each
				   thread computed for the reduction
				   operation.  */
};

/* Reduction info hashtable helpers.  */

struct reduction_hasher : free_ptr_hash <reduction_info>
{
  static inline hashval_t hash (const reduction_info *);
  static inline bool equal (const reduction_info *, const reduction_info *);
};

/* Equality and hash functions for hashtab code.  */

inline bool
reduction_hasher::equal (const reduction_info *a, const reduction_info *b)
{
  return (a->reduc_phi == b->reduc_phi);
}

inline hashval_t
reduction_hasher::hash (const reduction_info *a)
{
  return a->reduc_version;
}

typedef hash_table<reduction_hasher> reduction_info_table_type;
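/* Returns the reduction information for the reduction phi PHI, or NULL if
   PHI does not describe a reduction registered in REDUCTION_LIST.  The
   lookup keys off gimple_uid (PHI), which caches the SSA_NAME_VERSION of
   the original reduction phi result.  */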
static struct reduction_info *
reduction_phi (reduction_info_table_type *reduction_list, gimple *phi)
{
  struct reduction_info tmpred, *red;

  if (reduction_list->elements () == 0 || phi == NULL)
    return NULL;

  if (gimple_uid (phi) == (unsigned int)-1
      || gimple_uid (phi) == 0)
    return NULL;

  tmpred.reduc_phi = phi;
  tmpred.reduc_version = gimple_uid (phi);
  red = reduction_list->find (&tmpred);
  gcc_assert (red == NULL || red->reduc_phi == phi);

  return red;
}

/* Element of hashtable of names to copy.  */

struct name_to_copy_elt
{
  unsigned version;	/* The version of the name to copy.  */
  tree new_name;	/* The new name used in the copy.  */
  tree field;		/* The field of the structure used to pass the
			   value.  */
};

/* Name copies hashtable helpers.  */

struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
{
  static inline hashval_t hash (const name_to_copy_elt *);
  static inline bool equal (const name_to_copy_elt *,
			    const name_to_copy_elt *);
};

/* Equality and hash functions for hashtab code.  */

inline bool
name_to_copy_hasher::equal (const name_to_copy_elt *a,
			    const name_to_copy_elt *b)
{
  return a->version == b->version;
}

inline hashval_t
name_to_copy_hasher::hash (const name_to_copy_elt *a)
{
  return (hashval_t) a->version;
}

typedef hash_table<name_to_copy_hasher> name_to_copy_table_type;

/* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
   matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
   represents the denominator for every element in the matrix.  */
typedef struct lambda_trans_matrix_s
{
  lambda_matrix matrix;
  int rowsize;
  int colsize;
  int denominator;
} *lambda_trans_matrix;
#define LTM_MATRIX(T) ((T)->matrix)
#define LTM_ROWSIZE(T) ((T)->rowsize)
#define LTM_COLSIZE(T) ((T)->colsize)
#define LTM_DENOMINATOR(T) ((T)->denominator)

/* Allocate a new transformation matrix.  */

static lambda_trans_matrix
lambda_trans_matrix_new (int colsize, int rowsize,
			 struct obstack * lambda_obstack)
{
  lambda_trans_matrix ret;

  ret = (lambda_trans_matrix)
    obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
  LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
  LTM_ROWSIZE (ret) = rowsize;
  LTM_COLSIZE (ret) = colsize;
  LTM_DENOMINATOR (ret) = 1;
  return ret;
}

/* Multiply a vector VEC by a matrix MAT.
   MAT is an M*N matrix, and VEC is a vector with length N.  The result
   is stored in DEST which must be a vector of length M.  */

static void
lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
			   lambda_vector vec, lambda_vector dest)
{
  int i, j;

  lambda_vector_clear (dest, m);
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      dest[i] += matrix[i][j] * vec[j];
}
/* Return true if TRANS is a legal transformation matrix that respects
   the dependence vectors in DISTS and DIRS.  The conservative answer
   is false.

   "Wolfe proves that a unimodular transformation represented by the
   matrix T is legal when applied to a loop nest with a set of
   lexicographically non-negative distance vectors RDG if and only if
   for each vector d in RDG, (T.d >= 0) is lexicographically positive.
   i.e.: if and only if it transforms the lexicographically positive
   distance vectors to lexicographically positive vectors.  Note that
   a unimodular matrix must transform the zero vector (and only it) to
   the zero vector." S.Muchnick.  */

static bool
lambda_transform_legal_p (lambda_trans_matrix trans,
			  int nb_loops,
			  vec<ddr_p> dependence_relations)
{
  unsigned int i, j;
  lambda_vector distres;
  struct data_dependence_relation *ddr;

  gcc_assert (LTM_COLSIZE (trans) == nb_loops
	      && LTM_ROWSIZE (trans) == nb_loops);

  /* When there are no dependences, the transformation is correct.  */
  if (dependence_relations.length () == 0)
    return true;

  ddr = dependence_relations[0];
  if (ddr == NULL)
    return true;

  /* When there is an unknown relation in the dependence_relations, we
     know that it is not worth looking at this loop nest: give up.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    return false;

  distres = lambda_vector_new (nb_loops);

  /* For each distance vector in the dependence graph.  */
  FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
    {
      /* Don't care about relations for which we know that there is no
	 dependence, nor about read-read (aka. input-dependences):
	 these data accesses can happen in any order.  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_known
	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
	continue;

      /* Conservatively answer: "this transformation is not valid".  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return false;

      /* If the dependence could not be captured by a distance vector,
	 conservatively answer that the transform is not valid.  */
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
	return false;

      /* Compute trans.dist_vect.  */
      for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
	{
	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
				     DDR_DIST_VECT (ddr, j), distres);

	  if (!lambda_vector_lexico_pos (distres, nb_loops))
	    return false;
	}
    }
  return true;
}
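/* A worked instance of the test above (illustrative): within this file the
   only transformation ever queried is loop reversal, i.e. the 1x1 matrix
   T = [-1].  For a dependence with distance vector d = (1) -- say
   a[i+1] = a[i] + 1 -- we get T.d = (-1), which is not lexicographically
   positive, so the reversal is rejected and the iterations are treated as
   dependent.  If every distance vector is the zero vector, T.d = (0)
   remains lexicographically non-negative, the reversal is legal, and the
   iterations may run in parallel.  */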
/* Data dependency analysis.  Returns true if the iterations of LOOP
   are independent of each other (that is, if we can execute them
   in parallel).  */

static bool
loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
{
  vec<ddr_p> dependence_relations;
  vec<data_reference_p> datarefs;
  lambda_trans_matrix trans;
  bool ret = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Considering loop %d\n", loop->num);
      if (!loop->inner)
	fprintf (dump_file, "loop is innermost\n");
      else
	fprintf (dump_file, "loop NOT innermost\n");
    }

  /* Check for problems with dependences.  If the loop can be reversed,
     the iterations are independent.  */
  auto_vec<loop_p, 3> loop_nest;
  datarefs.create (10);
  dependence_relations.create (100);
  if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
					   &dependence_relations))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
      ret = false;
      goto end;
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_data_dependence_relations (dump_file, dependence_relations);

  trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
  LTM_MATRIX (trans)[0][0] = -1;

  if (lambda_transform_legal_p (trans, 1, dependence_relations))
    {
      ret = true;
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
    }
  else if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "  FAILED: data dependencies exist across iterations\n");

 end:
  free_dependence_relations (dependence_relations);
  free_data_refs (datarefs);

  return ret;
}

/* Return true when LOOP contains basic blocks marked with the
   BB_IRREDUCIBLE_LOOP flag.  */

static inline bool
loop_has_blocks_with_irreducible_flag (struct loop *loop)
{
  unsigned i;
  basic_block *bbs = get_loop_body_in_dom_order (loop);
  bool res = true;

  for (i = 0; i < loop->num_nodes; i++)
    if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
      goto end;

  res = false;
 end:
  free (bbs);
  return res;
}

/* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
   The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
   to their addresses that can be reused.  The address of OBJ is known to
   be invariant in the whole function.  Other needed statements are placed
   right before GSI.  */

static tree
take_address_of (tree obj, tree type, edge entry,
		 int_tree_htab_type *decl_address, gimple_stmt_iterator *gsi)
{
  int uid;
  tree *var_p, name, addr;
  gassign *stmt;
  gimple_seq stmts;

  /* Since the address of OBJ is invariant, the trees may be shared.
     Avoid rewriting unrelated parts of the code.  */
  obj = unshare_expr (obj);
  for (var_p = &obj;
       handled_component_p (*var_p);
       var_p = &TREE_OPERAND (*var_p, 0))
    continue;

  /* Canonicalize the access to base on a MEM_REF.  */
  if (DECL_P (*var_p))
    *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));

  /* Assign a canonical SSA name to the address of the base decl used
     in the address and share it for all accesses and addresses based
     on it.  */
  uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
  int_tree_map elt;
  elt.uid = uid;
  int_tree_map *slot = decl_address->find_slot (elt, INSERT);
  if (!slot->to)
    {
      if (gsi == NULL)
	return NULL;
      addr = TREE_OPERAND (*var_p, 0);
      const char *obj_name
	= get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
      if (obj_name)
	name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
      else
	name = make_ssa_name (TREE_TYPE (addr));
      stmt = gimple_build_assign (name, addr);
      gsi_insert_on_edge_immediate (entry, stmt);

      slot->uid = uid;
      slot->to = name;
    }
  else
    name = slot->to;

  /* Express the address in terms of the canonical SSA name.  */
  TREE_OPERAND (*var_p, 0) = name;
  if (gsi == NULL)
    return build_fold_addr_expr_with_type (obj, type);

  name = force_gimple_operand (build_addr (obj),
			       &stmts, true, NULL_TREE);
  if (!gimple_seq_empty_p (stmts))
    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

  if (!useless_type_conversion_p (type, TREE_TYPE (name)))
    {
      name = force_gimple_operand (fold_convert (type, name), &stmts, true,
				   NULL_TREE);
      if (!gimple_seq_empty_p (stmts))
	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
    }

  return name;
}
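/* For illustration of take_address_of (an assumed example, not taken from a
   dump): if a local 'int x[10]' is accessed as x[i_2] inside the region,
   the access is canonicalized to be based on a single SSA name holding &x,
   roughly

     x.0_1 = &x;                            <-- inserted once, on edge ENTRY
     ... MEM[(int[10] *)x.0_1][i_2] ...

   so all later accesses and addresses based on 'x' can share x.0_1 via
   DECL_ADDRESS.  */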
/* Returns the result of the reduction statement STMT: the phi result if
   STMT is a phi, otherwise the assignment lhs.  */

static tree
reduc_stmt_res (gimple *stmt)
{
  return (gimple_code (stmt) == GIMPLE_PHI
	  ? gimple_phi_result (stmt)
	  : gimple_assign_lhs (stmt));
}

/* Callback for htab_traverse.  Create the initialization statement
   for reduction described in SLOT, and place it at the preheader of
   the loop described in DATA.  */

int
initialize_reductions (reduction_info **slot, struct loop *loop)
{
  tree init;
  tree type, arg;
  edge e;

  struct reduction_info *const reduc = *slot;

  /* Create initialization in preheader:
     reduction_variable = initialization value of reduction.  */

  /* In the phi node at the header, replace the argument coming
     from the preheader with the reduction initialization value.  */

  /* Initialize the reduction.  */
  type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  init = omp_reduction_init_op (gimple_location (reduc->reduc_stmt),
				reduc->reduction_code, type);
  reduc->init = init;

  /* Replace the argument representing the initialization value
     with the initialization value for the reduction (neutral
     element for the particular operation, e.g. 0 for PLUS_EXPR,
     1 for MULT_EXPR, etc).
     Keep the old value in a new variable "reduction_initial",
     which will be taken into consideration after the parallel
     computing is done.  */

  e = loop_preheader_edge (loop);
  arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
  /* Create new variable to hold the initial value.  */

  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
  reduc->initial_value = arg;
  return 1;
}
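/* Tying this back to the file-header example (illustrative): for the '+'
   reduction with user initial value 1, initialize_reductions replaces the
   preheader phi argument 1 by the neutral element 0 and remembers 1 in
   reduc->initial_value; create_stores_for_reduction later stores that 1
   into the shared struct, so the threads' combined result can be folded
   with it after the join.  */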
/* Data passed to eliminate_local_variables_1 via walk_stmt_info.  */

struct elv_data
{
  struct walk_stmt_info info;
  edge entry;
  int_tree_htab_type *decl_address;
  gimple_stmt_iterator *gsi;
  bool changed;
  bool reset;
};

/* Eliminates references to local variables in *TP out of the single
   entry single exit region starting at DTA->ENTRY.  DECL_ADDRESS
   contains addresses of the references that had their address taken
   already.  If the expression is changed, CHANGED is set to true.
   Callback for walk_tree.  */

static tree
eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
{
  struct elv_data *const dta = (struct elv_data *) data;
  tree t = *tp, var, addr, addr_type, type, obj;

  if (DECL_P (t))
    {
      *walk_subtrees = 0;

      if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
	return NULL_TREE;

      type = TREE_TYPE (t);
      addr_type = build_pointer_type (type);
      addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}

      *tp = build_simple_mem_ref (addr);

      dta->changed = true;
      return NULL_TREE;
    }

  if (TREE_CODE (t) == ADDR_EXPR)
    {
      /* ADDR_EXPR may appear in two contexts:
	 -- as a gimple operand, when the address taken is a function
	    invariant
	 -- as gimple rhs, when the resulting address is not a function
	    invariant
	 We do not need to do anything special in the latter case (the base of
	 the memory reference whose address is taken may be replaced in the
	 DECL_P case).  The former case is more complicated, as we need to
	 ensure that the new address is still a gimple operand.  Thus, it
	 is not sufficient to replace just the base of the memory reference --
	 we need to move the whole computation of the address out of the
	 loop.  */
      if (!is_gimple_val (t))
	return NULL_TREE;

      *walk_subtrees = 0;
      obj = TREE_OPERAND (t, 0);
      var = get_base_address (obj);
      if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
	return NULL_TREE;

      addr_type = TREE_TYPE (t);
      addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}
      *tp = addr;

      dta->changed = true;
      return NULL_TREE;
    }

  if (!EXPR_P (t))
    *walk_subtrees = 0;

  return NULL_TREE;
}

/* Moves the references to local variables in STMT at *GSI out of the single
   entry single exit region starting at ENTRY.  DECL_ADDRESS contains
   addresses of the references that had their address taken
   already.  */

static void
eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
				int_tree_htab_type *decl_address)
{
  struct elv_data dta;
  gimple *stmt = gsi_stmt (*gsi);

  memset (&dta.info, '\0', sizeof (dta.info));
  dta.entry = entry;
  dta.decl_address = decl_address;
  dta.changed = false;
  dta.reset = false;

  if (gimple_debug_bind_p (stmt))
    {
      dta.gsi = NULL;
      walk_tree (gimple_debug_bind_get_value_ptr (stmt),
		 eliminate_local_variables_1, &dta.info, NULL);
      if (dta.reset)
	{
	  gimple_debug_bind_reset_value (stmt);
	  dta.changed = true;
	}
    }
  else if (gimple_clobber_p (stmt))
    {
      unlink_stmt_vdef (stmt);
      stmt = gimple_build_nop ();
      gsi_replace (gsi, stmt, false);
      dta.changed = true;
    }
  else
    {
      dta.gsi = gsi;
      walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
    }

  if (dta.changed)
    update_stmt (stmt);
}

/* Eliminates the references to local variables from the single entry
   single exit region between the ENTRY and EXIT edges.

   This includes:
   1) Taking address of a local variable -- these are moved out of the
      region (and a temporary variable is created to hold the address if
      necessary).
   2) Dereferencing a local variable -- these are replaced with indirect
      references.  */

static void
eliminate_local_variables (edge entry, edge exit)
{
  basic_block bb;
  auto_vec<basic_block, 3> body;
  unsigned i;
  gimple_stmt_iterator gsi;
  bool has_debug_stmt = false;
  int_tree_htab_type decl_address (10);
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;

  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  FOR_EACH_VEC_ELT (body, i, bb)
    if (bb != entry_bb && bb != exit_bb)
      {
	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (is_gimple_debug (gsi_stmt (gsi)))
	    {
	      if (gimple_debug_bind_p (gsi_stmt (gsi)))
		has_debug_stmt = true;
	    }
	  else
	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
      }

  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
}

/* Returns true if expression EXPR is not defined between ENTRY and
   EXIT, i.e. if all its operands are defined outside of the region.  */

static bool
expr_invariant_in_region_p (edge entry, edge exit, tree expr)
{
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;
  basic_block def_bb;

  if (is_gimple_min_invariant (expr))
    return true;

  if (TREE_CODE (expr) == SSA_NAME)
    {
      def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
      if (def_bb
	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
	return false;

      return true;
    }

  return false;
}

/* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
   The copies are stored to NAME_COPIES; if NAME was already duplicated,
   its duplicate stored in NAME_COPIES is returned.

   Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
   duplicated, storing the copies in DECL_COPIES.  */

static tree
separate_decls_in_region_name (tree name, name_to_copy_table_type *name_copies,
			       int_tree_htab_type *decl_copies,
			       bool copy_name_p)
{
  tree copy, var, var_copy;
  unsigned idx, uid, nuid;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt, *nelt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (TREE_CODE (name) != SSA_NAME)
    return name;

  idx = SSA_NAME_VERSION (name);
  elt.version = idx;
  slot = name_copies->find_slot_with_hash (&elt, idx,
					   copy_name_p ? INSERT : NO_INSERT);
  if (slot && *slot)
    return (*slot)->new_name;

  if (copy_name_p)
    {
      copy = duplicate_ssa_name (name, NULL);
      nelt = XNEW (struct name_to_copy_elt);
      nelt->version = idx;
      nelt->new_name = copy;
      nelt->field = NULL_TREE;
      *slot = nelt;
    }
  else
    {
      gcc_assert (!slot);
      copy = name;
    }

  var = SSA_NAME_VAR (name);
  if (!var)
    return copy;

  uid = DECL_UID (var);
  ielt.uid = uid;
  dslot = decl_copies->find_slot_with_hash (ielt, uid, INSERT);
  if (!dslot->to)
    {
      var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
      DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
      dslot->uid = uid;
      dslot->to = var_copy;

      /* Ensure that when we meet this decl next time, we won't duplicate
	 it again.  */
      nuid = DECL_UID (var_copy);
      ielt.uid = nuid;
      dslot = decl_copies->find_slot_with_hash (ielt, nuid, INSERT);
      gcc_assert (!dslot->to);
      dslot->uid = nuid;
      dslot->to = var_copy;
    }
  else
    var_copy = dslot->to;

  replace_ssa_name_symbol (copy, var_copy);
  return copy;
}

/* Finds the ssa names used in STMT that are defined outside the
   region between ENTRY and EXIT and replaces such ssa names with
   their duplicates.  The duplicates are stored to NAME_COPIES.  Base
   decls of all ssa names used in STMT (including those defined in
   LOOP) are replaced with the new temporary variables; the
   replacement decls are stored in DECL_COPIES.  */

static void
separate_decls_in_region_stmt (edge entry, edge exit, gimple *stmt,
			       name_to_copy_table_type *name_copies,
			       int_tree_htab_type *decl_copies)
{
  use_operand_p use;
  def_operand_p def;
  ssa_op_iter oi;
  tree name, copy;
  bool copy_name_p;

  FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
    {
      name = DEF_FROM_PTR (def);
      gcc_assert (TREE_CODE (name) == SSA_NAME);
      copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					    false);
      gcc_assert (copy == name);
    }

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
    {
      name = USE_FROM_PTR (use);
      if (TREE_CODE (name) != SSA_NAME)
	continue;

      copy_name_p = expr_invariant_in_region_p (entry, exit, name);
      copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					    copy_name_p);
      SET_USE (use, copy);
    }
}
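/* For illustration (assumed ssa names): if a statement in the region uses
   n_1, defined before ENTRY, the use is rewritten to a fresh duplicate
   that will later be loaded from the shared .paral_data struct, while a
   name m_2 defined inside the region keeps its version and only has its
   base decl replaced by the region-local copy.  */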
/* Rewrites debug stmt STMT in terms of the duplicates recorded in
   NAME_COPIES and DECL_COPIES: the bound variable is replaced with its
   duplicate decl, and the ssa names used in the value are replaced with
   their duplicates.  Returns true if STMT cannot be rewritten and should
   be removed instead.  */

static bool
separate_decls_in_region_debug (gimple *stmt,
				name_to_copy_table_type *name_copies,
				int_tree_htab_type *decl_copies)
{
  use_operand_p use;
  ssa_op_iter oi;
  tree var, name;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (gimple_debug_bind_p (stmt))
    var = gimple_debug_bind_get_var (stmt);
  else if (gimple_debug_source_bind_p (stmt))
    var = gimple_debug_source_bind_get_var (stmt);
  else
    return true;
  if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
    return true;
  gcc_assert (DECL_P (var) && SSA_VAR_P (var));
  ielt.uid = DECL_UID (var);
  dslot = decl_copies->find_slot_with_hash (ielt, ielt.uid, NO_INSERT);
  if (!dslot)
    return true;
  if (gimple_debug_bind_p (stmt))
    gimple_debug_bind_set_var (stmt, dslot->to);
  else if (gimple_debug_source_bind_p (stmt))
    gimple_debug_source_bind_set_var (stmt, dslot->to);

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
    {
      name = USE_FROM_PTR (use);
      if (TREE_CODE (name) != SSA_NAME)
	continue;

      elt.version = SSA_NAME_VERSION (name);
      slot = name_copies->find_slot_with_hash (&elt, elt.version, NO_INSERT);
      if (!slot)
	{
	  gimple_debug_bind_reset_value (stmt);
	  update_stmt (stmt);
	  break;
	}

      SET_USE (use, (*slot)->new_name);
    }

  return false;
}

/* Callback for htab_traverse.  Adds a field corresponding to the reduction
   specified in SLOT.  The type is passed in DATA.  */

int
add_field_for_reduction (reduction_info **slot, tree type)
{
  struct reduction_info *const red = *slot;
  tree var = reduc_stmt_res (red->reduc_stmt);
  tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));

  insert_field_into_struct (type, field);

  red->field = field;

  return 1;
}

/* Callback for htab_traverse.  Adds a field corresponding to a ssa name
   described in SLOT.  The type is passed in DATA.  */

int
add_field_for_name (name_to_copy_elt **slot, tree type)
{
  struct name_to_copy_elt *const elt = *slot;
  tree name = ssa_name (elt->version);
  tree field = build_decl (UNKNOWN_LOCATION,
			   FIELD_DECL, SSA_NAME_IDENTIFIER (name),
			   TREE_TYPE (name));

  insert_field_into_struct (type, field);
  elt->field = field;

  return 1;
}

/* Callback for htab_traverse.  A local result is the intermediate result
   computed by a single thread, or the initial value in case no iteration
   was executed.  This function creates a phi node reflecting these values.
   The phi's result will be stored in NEW_PHI field of the
   reduction's data structure.  */

int
create_phi_for_local_result (reduction_info **slot, struct loop *loop)
{
  struct reduction_info *const reduc = *slot;
  edge e;
  gphi *new_phi;
  basic_block store_bb, continue_bb;
  tree local_res;
  source_location locus;

  /* STORE_BB is the block where the phi
     should be stored.  It is the destination of the loop exit.
     (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
  continue_bb = single_pred (loop->latch);
  store_bb = FALLTHRU_EDGE (continue_bb)->dest;

  /* STORE_BB has two predecessors.
     One coming from the loop
     (the reduction's result is computed at the loop),
     and another coming from a block preceding the loop,
     when no iterations
     are executed (the initial value should be taken).  */
  if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (continue_bb))
    e = EDGE_PRED (store_bb, 1);
  else
    e = EDGE_PRED (store_bb, 0);
  tree lhs = reduc_stmt_res (reduc->reduc_stmt);
  local_res = copy_ssa_name (lhs);
  locus = gimple_location (reduc->reduc_stmt);
  new_phi = create_phi_node (local_res, store_bb);
  add_phi_arg (new_phi, reduc->init, e, locus);
  add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (continue_bb), locus);
  reduc->new_phi = new_phi;

  return 1;
}

struct clsn_data
{
  tree store;
  tree load;

  basic_block store_bb;
  basic_block load_bb;
};

/* Callback for htab_traverse.  Create an atomic instruction for the
   reduction described in SLOT.
   DATA annotates the place in memory the atomic operation relates to,
   and the basic block it needs to be generated in.  */

int
create_call_for_reduction_1 (reduction_info **slot,
			     struct clsn_data *clsn_data)
{
  struct reduction_info *const reduc = *slot;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  tree load_struct;
  basic_block bb;
  basic_block new_bb;
  edge e;
  tree t, addr, ref, x;
  tree tmp_load, name;
  gimple *load;

  if (reduc->reduc_addr == NULL_TREE)
    {
      load_struct = build_simple_mem_ref (clsn_data->load);
      t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);

      addr = build_addr (t);
    }
  else
    {
      /* Set the address for the atomic store.  */
      addr = reduc->reduc_addr;

      /* Remove the non-atomic store '*addr = sum'.  */
      tree res = PHI_RESULT (reduc->keep_res);
      use_operand_p use_p;
      gimple *stmt;
      bool single_use_p = single_imm_use (res, &use_p, &stmt);
      gcc_assert (single_use_p);
      replace_uses_by (gimple_vdef (stmt),
		       gimple_vuse (stmt));
      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
      gsi_remove (&gsi, true);
    }

  /* Create phi node.  */
  bb = clsn_data->load_bb;

  gsi = gsi_last_bb (bb);
  e = split_block (bb, gsi_stmt (gsi));
  new_bb = e->dest;

  tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)));
  tmp_load = make_ssa_name (tmp_load);
  load = gimple_build_omp_atomic_load (tmp_load, addr);
  SSA_NAME_DEF_STMT (tmp_load) = load;
  gsi = gsi_start_bb (new_bb);
  gsi_insert_after (&gsi, load, GSI_NEW_STMT);

  e = split_block (new_bb, load);
  new_bb = e->dest;
  gsi = gsi_start_bb (new_bb);
  ref = tmp_load;
  x = fold_build2 (reduc->reduction_code,
		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
		   PHI_RESULT (reduc->new_phi));

  name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
				   GSI_CONTINUE_LINKING);

  gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
  return 1;
}
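/* The sequence generated above corresponds to the atomic update shown in
   the file-header example, i.e. roughly (illustrative):

     GIMPLE_OMP_ATOMIC_LOAD:  tmp_1 = *addr
     sum_2 = tmp_1 <reduction_code> local_result
     GIMPLE_OMP_ATOMIC_STORE (sum_2)

   with each thread folding its local result into the shared field.  */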
/* Create the atomic operation at the join point of the threads.
   REDUCTION_LIST describes the reductions in the LOOP.
   LD_ST_DATA describes the shared data structure where
   shared data is stored in and loaded from.  */
static void
create_call_for_reduction (struct loop *loop,
			   reduction_info_table_type *reduction_list,
			   struct clsn_data *ld_st_data)
{
  reduction_list->traverse <struct loop *, create_phi_for_local_result> (loop);
  /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
  basic_block continue_bb = single_pred (loop->latch);
  ld_st_data->load_bb = FALLTHRU_EDGE (continue_bb)->dest;
  reduction_list
    ->traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
}

/* Callback for htab_traverse.  Loads the final reduction value at the
   join point of all threads, and inserts it in the right place.  */

int
create_loads_for_reductions (reduction_info **slot,
			     struct clsn_data *clsn_data)
{
  struct reduction_info *const red = *slot;
  gimple *stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
  tree load_struct;
  tree name;
  tree x;

  /* If there's no exit phi, the result of the reduction is unused.  */
  if (red->keep_res == NULL)
    return 1;

  gsi = gsi_after_labels (clsn_data->load_bb);
  load_struct = build_simple_mem_ref (clsn_data->load);
  load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
			NULL_TREE);

  x = load_struct;
  name = PHI_RESULT (red->keep_res);
  stmt = gimple_build_assign (name, x);

  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
       !gsi_end_p (gsi); gsi_next (&gsi))
    if (gsi_stmt (gsi) == red->keep_res)
      {
	remove_phi_node (&gsi, false);
	return 1;
      }
  gcc_unreachable ();
}

/* Load the reduction result that was stored in LD_ST_DATA.
   REDUCTION_LIST describes the list of reductions that the
   loads should be generated for.  */
static void
create_final_loads_for_reduction (reduction_info_table_type *reduction_list,
				  struct clsn_data *ld_st_data)
{
  gimple_stmt_iterator gsi;
  tree t;
  gimple *stmt;

  gsi = gsi_after_labels (ld_st_data->load_bb);
  t = build_fold_addr_expr (ld_st_data->store);
  stmt = gimple_build_assign (ld_st_data->load, t);

  gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);

  reduction_list
    ->traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
}

/* Callback for htab_traverse.  Store the initial value of the reduction
   variable (the value given by the user, saved in INITIAL_VALUE by
   initialize_reductions) into the reduction field of the shared
   structure.  The reduction is specified in SLOT.  The store
   information is passed in DATA.  */

int
create_stores_for_reduction (reduction_info **slot,
			     struct clsn_data *clsn_data)
{
  struct reduction_info *const red = *slot;
  tree t;
  gimple *stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));

  gsi = gsi_last_bb (clsn_data->store_bb);
  t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
  stmt = gimple_build_assign (t, red->initial_value);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return 1;
}

/* Callback for htab_traverse.  Creates a load to a field of LOAD in LOAD_BB
   and a store to a field of STORE in STORE_BB for the ssa name and its
   duplicate specified in SLOT.
   */

int
create_loads_and_stores_for_name (name_to_copy_elt **slot,
				  struct clsn_data *clsn_data)
{
  struct name_to_copy_elt *const elt = *slot;
  tree t;
  gimple *stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (elt->new_name);
  tree load_struct;

  gsi = gsi_last_bb (clsn_data->store_bb);
  t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
  stmt = gimple_build_assign (t, ssa_name (elt->version));
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  gsi = gsi_last_bb (clsn_data->load_bb);
  load_struct = build_simple_mem_ref (clsn_data->load);
  t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
  stmt = gimple_build_assign (elt->new_name, t);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return 1;
}

/* Moves all the variables used in LOOP and defined outside of it (including
   the initial values of loop phi nodes, and *PER_THREAD if it is an ssa
   name) to a structure created for this purpose.  The code

   while (1)
     {
       use (a);
       use (b);
     }

   is transformed this way:

   bb0:
   old.a = a;
   old.b = b;

   bb1:
   a' = new->a;
   b' = new->b;
   while (1)
     {
       use (a');
       use (b');
     }

   `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
   pointer `new' is intentionally not initialized (the loop will be split to a
   separate function later, and `new' will be initialized from its
   arguments).  LD_ST_DATA holds information about the shared data structure
   used to pass information among the threads.  It is initialized here, and
   gen_parallel_loop will pass it to create_call_for_reduction that
   needs this information.  REDUCTION_LIST describes the reductions
   in LOOP.  */

static void
separate_decls_in_region (edge entry, edge exit,
			  reduction_info_table_type *reduction_list,
			  tree *arg_struct, tree *new_arg_struct,
			  struct clsn_data *ld_st_data)

{
  basic_block bb1 = split_edge (entry);
  basic_block bb0 = single_pred (bb1);
  name_to_copy_table_type name_copies (10);
  int_tree_htab_type decl_copies (10);
  unsigned i;
  tree type, type_name, nvar;
  gimple_stmt_iterator gsi;
  struct clsn_data clsn_data;
  auto_vec<basic_block, 3> body;
  basic_block bb;
  basic_block entry_bb = bb1;
  basic_block exit_bb = exit->dest;
  bool has_debug_stmt = false;

  entry = single_succ_edge (entry_bb);
  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  FOR_EACH_VEC_ELT (body, i, bb)
    {
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
					   &name_copies, &decl_copies);

	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		has_debug_stmt = true;
	      else
		separate_decls_in_region_stmt (entry, exit, stmt,
					       &name_copies, &decl_copies);
	    }
	}
    }

  /* Now process debug bind stmts.
     We must not create decls while
     processing debug stmts, so we defer their processing so as to
     make sure we will have debug info for as many variables as
     possible (all of those that were dealt with in the loop above),
     and discard those for which we know there's nothing we can
     do.  */
  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		{
		  if (separate_decls_in_region_debug (stmt, &name_copies,
						      &decl_copies))
		    {
		      gsi_remove (&gsi, true);
		      continue;
		    }
		}

	      gsi_next (&gsi);
	    }
	}

  if (name_copies.elements () == 0 && reduction_list->elements () == 0)
    {
      /* It may happen that there is nothing to copy (if there are only
	 loop carried and external variables in the loop).  */
      *arg_struct = NULL;
      *new_arg_struct = NULL;
    }
  else
    {
      /* Create the type for the structure to store the ssa names to.  */
      type = lang_hooks.types.make_type (RECORD_TYPE);
      type_name = build_decl (UNKNOWN_LOCATION,
			      TYPE_DECL, create_tmp_var_name (".paral_data"),
			      type);
      TYPE_NAME (type) = type_name;

      name_copies.traverse <tree, add_field_for_name> (type);
      if (reduction_list && reduction_list->elements () > 0)
	{
	  /* Create the fields for reductions.  */
	  reduction_list->traverse <tree, add_field_for_reduction> (type);
	}
      layout_type (type);

      /* Create the loads and stores.  */
      *arg_struct = create_tmp_var (type, ".paral_data_store");
      nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
      *new_arg_struct = make_ssa_name (nvar);

      ld_st_data->store = *arg_struct;
      ld_st_data->load = *new_arg_struct;
      ld_st_data->store_bb = bb0;
      ld_st_data->load_bb = bb1;

      name_copies
	.traverse <struct clsn_data *, create_loads_and_stores_for_name>
	(ld_st_data);

      /* Load the calculation from memory (after the join of the threads).  */

      if (reduction_list && reduction_list->elements () > 0)
	{
	  reduction_list
	    ->traverse <struct clsn_data *, create_stores_for_reduction>
	    (ld_st_data);
	  clsn_data.load = make_ssa_name (nvar);
	  clsn_data.load_bb = exit->dest;
	  clsn_data.store = ld_st_data->store;
	  create_final_loads_for_reduction (reduction_list, &clsn_data);
	}
    }
}

/* Returns true if FN was created to run in parallel.  */

bool
parallelized_function_p (tree fndecl)
{
  cgraph_node *node = cgraph_node::get (fndecl);
  gcc_assert (node != NULL);
  return node->parallelized_function;
}

/* Creates and returns an empty function that will receive the body of
   a parallelized loop.  */

static tree
create_loop_fn (location_t loc)
{
  char buf[100];
  char *tname;
  tree decl, type, name, t;
  struct function *act_cfun = cfun;
  static unsigned loopfn_num;

  loc = LOCATION_LOCUS (loc);
  snprintf (buf, 100, "%s.$loopfn", current_function_name ());
  ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
  clean_symbol_name (tname);
  name = get_identifier (tname);
  type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);

  decl = build_decl (loc, FUNCTION_DECL, name, type);
  TREE_STATIC (decl) = 1;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 0;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;
  DECL_EXTERNAL (decl) = 0;
  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;

  t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
		  ptr_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_ARG_TYPE (t) = ptr_type_node;
  DECL_CONTEXT (t) = decl;
  TREE_USED (t) = 1;
  DECL_ARGUMENTS (decl) = t;

  allocate_struct_function (decl, false);

  /* The call to allocate_struct_function clobbers CFUN, so we need to restore
     it.  */
  set_cfun (act_cfun);

  return decl;
}
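/* For example (the exact name depends on the target's
   ASM_FORMAT_PRIVATE_NAME): for a function 'foo', the outlined loop
   function receives a private name derived from "foo.$loopfn" plus a
   sequence number, takes the single ".paral_data_param" pointer argument,
   and returns void.  */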
/* Replace uses of NAME by VAL in block BB.  */

static void
replace_uses_in_bb_by (tree name, tree val, basic_block bb)
{
  gimple *use_stmt;
  imm_use_iterator imm_iter;

  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
    {
      if (gimple_bb (use_stmt) != bb)
	continue;

      use_operand_p use_p;
      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	SET_USE (use_p, val);
    }
}

/* Do transformation from:

     <bb preheader>:
     ...
     goto <bb header>

     <bb header>:
     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
     sum_a = PHI <sum_init (preheader), sum_b (latch)>
     ...
     use (ivtmp_a)
     ...
     sum_b = sum_a + sum_update
     ...
     if (ivtmp_a < n)
       goto <bb latch>;
     else
       goto <bb exit>;

     <bb latch>:
     ivtmp_b = ivtmp_a + 1;
     goto <bb header>

     <bb exit>:
     sum_z = PHI <sum_b (cond[1]), ...>

     [1] Where <bb cond> is single_pred (bb latch); in the simplest case,
	 that's <bb header>.

   to:

     <bb preheader>:
     ...
     goto <bb newheader>

     <bb header>:
     ivtmp_a = PHI <ivtmp_c (latch)>
     sum_a = PHI <sum_c (latch)>
     ...
     use (ivtmp_a)
     ...
     sum_b = sum_a + sum_update
     ...
     goto <bb latch>;

     <bb newheader>:
     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
     sum_c = PHI <sum_init (preheader), sum_b (latch)>
     if (ivtmp_c < n + 1)
       goto <bb header>;
     else
       goto <bb newexit>;

     <bb latch>:
     ivtmp_b = ivtmp_a + 1;
     goto <bb newheader>

     <bb newexit>:
     sum_y = PHI <sum_c (newheader)>

     <bb exit>:
     sum_z = PHI <sum_y (newexit), ...>


   In unified diff format:

     <bb preheader>:
     ...
-    goto <bb header>
+    goto <bb newheader>

     <bb header>:
-    ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
-    sum_a = PHI <sum_init (preheader), sum_b (latch)>
+    ivtmp_a = PHI <ivtmp_c (latch)>
+    sum_a = PHI <sum_c (latch)>
     ...
     use (ivtmp_a)
     ...
     sum_b = sum_a + sum_update
     ...
-    if (ivtmp_a < n)
-      goto <bb latch>;
+    goto <bb latch>;
+
+    <bb newheader>:
+    ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+    sum_c = PHI <sum_init (preheader), sum_b (latch)>
+    if (ivtmp_c < n + 1)
+      goto <bb header>;
     else
       goto <bb exit>;

     <bb latch>:
     ivtmp_b = ivtmp_a + 1;
-    goto <bb header>
+    goto <bb newheader>

+    <bb newexit>:
+    sum_y = PHI <sum_c (newheader)>

     <bb exit>:
-    sum_z = PHI <sum_b (cond[1]), ...>
+    sum_z = PHI <sum_y (newexit), ...>

   Note: the example does not show any virtual phis, but these are handled
   more or less as reductions.


   Moves the exit condition of LOOP to the beginning of its header.
   REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
   bound.  */

static void
transform_to_exit_first_loop_alt (struct loop *loop,
				  reduction_info_table_type *reduction_list,
				  tree bound)
{
  basic_block header = loop->header;
  basic_block latch = loop->latch;
  edge exit = single_dom_exit (loop);
  basic_block exit_block = exit->dest;
  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
  tree control = gimple_cond_lhs (cond_stmt);
  edge e;

  /* Rewriting virtuals into loop-closed ssa normal form makes this
     transformation simpler.  It also ensures that the virtuals are in
     loop-closed ssa normal form after the transformation, which is required
     by create_parallel_loop.  */
  rewrite_virtuals_into_loop_closed_ssa (loop);

  /* Create the new_header block.  */
  basic_block new_header = split_block_before_cond_jump (exit->src);
  edge edge_at_split = single_pred_edge (new_header);

  /* Redirect entry edge to new_header.  */
  edge entry = loop_preheader_edge (loop);
  e = redirect_edge_and_branch (entry, new_header);
  gcc_assert (e == entry);

  /* Redirect post_inc_edge to new_header.  */
  edge post_inc_edge = single_succ_edge (latch);
  e = redirect_edge_and_branch (post_inc_edge, new_header);
  gcc_assert (e == post_inc_edge);

  /* Redirect post_cond_edge to header.  */
  edge post_cond_edge = single_pred_edge (latch);
  e = redirect_edge_and_branch (post_cond_edge, header);
  gcc_assert (e == post_cond_edge);

  /* Redirect edge_at_split to latch.  */
  e = redirect_edge_and_branch (edge_at_split, latch);
  gcc_assert (e == edge_at_split);

  /* Set the new loop bound.  */
  gimple_cond_set_rhs (cond_stmt, bound);
  update_stmt (cond_stmt);

  /* Repair the ssa.  */
  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
  edge_var_map *vm;
  gphi_iterator gsi;
  int i;
  for (gsi = gsi_start_phis (header), i = 0;
       !gsi_end_p (gsi) && v->iterate (i, &vm);
       gsi_next (&gsi), i++)
    {
      gphi *phi = gsi.phi ();
      tree res_a = PHI_RESULT (phi);

      /* Create new phi.  */
      tree res_c = copy_ssa_name (res_a, phi);
      gphi *nphi = create_phi_node (res_c, new_header);

      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
      replace_uses_in_bb_by (res_a, res_c, new_header);

      /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
      add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);

      /* Replace sum_b with sum_c in exit phi.  */
      tree res_b = redirect_edge_var_map_def (vm);
      replace_uses_in_bb_by (res_b, res_c, exit_block);

      struct reduction_info *red = reduction_phi (reduction_list, phi);
      gcc_assert (virtual_operand_p (res_a)
		  || res_a == control
		  || red != NULL);

      if (red)
	{
	  /* Register the new reduction phi.  */
	  red->reduc_phi = nphi;
	  gimple_set_uid (red->reduc_phi, red->reduc_version);
	}
    }
  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));

  /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
  flush_pending_stmts (entry);

  /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
  flush_pending_stmts (post_inc_edge);


  basic_block new_exit_block = NULL;
  if (!single_pred_p (exit->dest))
    {
      /* Create a new empty exit block, in between the new loop header and
	 the old exit block.  The function separate_decls_in_region needs
	 this block to insert code that is active on loop exit, but not any
	 other path.  */
      new_exit_block = split_edge (exit);
    }

  /* Insert and register the reduction exit phis.  */
  for (gphi_iterator gsi = gsi_start_phis (exit_block);
       !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      gphi *nphi = NULL;
      tree res_z = PHI_RESULT (phi);
      tree res_c;

      if (new_exit_block != NULL)
	{
	  /* Now that we have a new exit block, duplicate the phi of the old
	     exit block in the new exit block to preserve loop-closed ssa.  */
	  edge succ_new_exit_block = single_succ_edge (new_exit_block);
	  edge pred_new_exit_block = single_pred_edge (new_exit_block);
	  tree res_y = copy_ssa_name (res_z, phi);
	  nphi = create_phi_node (res_y, new_exit_block);
	  res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
	  add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
	  add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
	}
      else
	res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (virtual_operand_p (res_z))
	continue;

      gimple *reduc_phi = SSA_NAME_DEF_STMT (res_c);
      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
      if (red != NULL)
	red->keep_res = (nphi != NULL
			 ? nphi
			 : phi);
    }

  /* We're going to cancel the loop at the end of gen_parallel_loop, but until
     then we're still using some fields, so only bother about fields that are
     still used: header and latch.
     The loop has a new header bb, so we update it.  The latch bb stays the
     same.  */
  loop->header = new_header;

  /* Recalculate dominance info.  */
  free_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_DOMINATORS);

  checking_verify_ssa (true, true);
}

/* Tries to move the exit condition of LOOP to the beginning of its header
   without duplication of the loop body.  NIT is the number of iterations
   of the loop.  REDUCTION_LIST describes the reductions in LOOP.  Return
   true if transformation is successful.
   */

static bool
try_transform_to_exit_first_loop_alt (struct loop *loop,
				      reduction_info_table_type *reduction_list,
				      tree nit)
{
  /* Check whether the latch contains a single statement.  */
  if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
    return false;

  /* Check whether the latch contains no phis.  */
  if (phi_nodes (loop->latch) != NULL)
    return false;

  /* Check whether the latch contains the loop iv increment.  */
  edge back = single_succ_edge (loop->latch);
  edge exit = single_dom_exit (loop);
  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
  tree control = gimple_cond_lhs (cond_stmt);
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
    return false;

  /* Check whether there's no code between the loop condition and the
     latch.  */
  if (!single_pred_p (loop->latch)
      || single_pred (loop->latch) != exit->src)
    return false;

  tree alt_bound = NULL_TREE;
  tree nit_type = TREE_TYPE (nit);

  /* Figure out whether nit + 1 overflows.  */
  if (TREE_CODE (nit) == INTEGER_CST)
    {
      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
	{
	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
				       nit, build_one_cst (nit_type));

	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
	  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
	  return true;
	}
      else
	{
	  /* Todo: Figure out whether we can trigger this, whether it's
	     worth handling optimally, and whether we can handle it
	     optimally.  */
	  return false;
	}
    }

  gcc_assert (TREE_CODE (nit) == SSA_NAME);

  /* Variable nit is the loop bound as returned by canonicalize_loop_ivs,
     for an iv with base 0 and step 1 that is incremented in the latch,
     like this:

     <bb header>:
     # iv_1 = PHI <0 (preheader), iv_2 (latch)>
     ...
     if (iv_1 < nit)
       goto <bb latch>;
     else
       goto <bb exit>;

     <bb latch>:
     iv_2 = iv_1 + 1;
     goto <bb header>;

     The range of iv_1 is [0, nit].  The latch edge is taken for
     iv_1 == [0, nit - 1] and the exit edge is taken for iv_1 == nit.  So the
     number of latch executions is equal to nit.

     The function max_loop_iterations gives us the maximum number of latch
     executions, so it gives us the maximum value of nit.  */
  widest_int nit_max;
  if (!max_loop_iterations (loop, &nit_max))
    return false;

  /* Check if nit + 1 overflows.  */
  widest_int type_max = wi::to_widest (TYPE_MAXVAL (nit_type));
  if (nit_max >= type_max)
    return false;
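  /* A concrete instance of the check above (illustrative): if NIT_TYPE is
     unsigned char, TYPE_MAXVAL is 255.  If max_loop_iterations can only
     prove nit <= 255, nit + 1 might wrap to 0 and we give up; if it proves
     nit <= 254, nit + 1 is representable and can serve as the new bound.  */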
*/
1892   if (alt_bound == NULL_TREE)
1893     {
1894       alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
1895                                build_int_cst_type (nit_type, 1));
1896
1897       gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1898
1899       alt_bound
1900         = force_gimple_operand_gsi (&gsi, alt_bound, true, NULL_TREE, false,
1901                                     GSI_CONTINUE_LINKING);
1902     }
1903
1904   transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
1905   return true;
1906 }
1907
1908 /* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
1909    number of iterations of the loop.  REDUCTION_LIST describes the reductions in
1910    LOOP.  */
1911
1912 static void
1913 transform_to_exit_first_loop (struct loop *loop,
1914                               reduction_info_table_type *reduction_list,
1915                               tree nit)
1916 {
1917   basic_block *bbs, *nbbs, ex_bb, orig_header;
1918   unsigned n;
1919   bool ok;
1920   edge exit = single_dom_exit (loop), hpred;
1921   tree control, control_name, res, t;
1922   gphi *phi, *nphi;
1923   gassign *stmt;
1924   gcond *cond_stmt, *cond_nit;
1925   tree nit_1;
1926
1927   split_block_after_labels (loop->header);
1928   orig_header = single_succ (loop->header);
1929   hpred = single_succ_edge (loop->header);
1930
1931   cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1932   control = gimple_cond_lhs (cond_stmt);
1933   gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1934
1935   /* Make sure that we have phi nodes on exit for all loop header phis
1936      (create_parallel_loop requires that).  */
1937   for (gphi_iterator gsi = gsi_start_phis (loop->header);
1938        !gsi_end_p (gsi);
1939        gsi_next (&gsi))
1940     {
1941       phi = gsi.phi ();
1942       res = PHI_RESULT (phi);
1943       t = copy_ssa_name (res, phi);
1944       SET_PHI_RESULT (phi, t);
1945       nphi = create_phi_node (res, orig_header);
1946       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
1947
1948       if (res == control)
1949         {
1950           gimple_cond_set_lhs (cond_stmt, t);
1951           update_stmt (cond_stmt);
1952           control = t;
1953         }
1954     }
1955
1956   bbs = get_loop_body_in_dom_order (loop);
1957
1958   for (n = 0; bbs[n] != exit->src; n++)
1959     continue;
1960   nbbs = XNEWVEC (basic_block, n);
1961   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1962                                    bbs + 1, n, nbbs);
1963   gcc_assert (ok);
1964   free (bbs);
1965   ex_bb = nbbs[0];
1966   free (nbbs);
1967
1968   /* Other than reductions, the only gimple reg that should be copied
1969      out of the loop is the control variable.  */
1970   exit = single_dom_exit (loop);
1971   control_name = NULL_TREE;
1972   for (gphi_iterator gsi = gsi_start_phis (ex_bb);
1973        !gsi_end_p (gsi); )
1974     {
1975       phi = gsi.phi ();
1976       res = PHI_RESULT (phi);
1977       if (virtual_operand_p (res))
1978         {
1979           gsi_next (&gsi);
1980           continue;
1981         }
1982
1983       /* Check if it is part of a reduction.  If it is, keep the phi
1984          in the reduction's keep_res field.  The PHI_RESULT of this phi
1985          is the resulting value of the reduction variable when exiting
1986          the loop.
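         As a made-up example, for a sum reduction over s, the exit block
         may contain a phi like

           # s_25 = PHI <s_14(4)>

         whose result carries the final value of s out of the loop; that
         phi is what gets recorded in keep_res.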
*/
1987
1988       if (reduction_list->elements () > 0)
1989         {
1990           struct reduction_info *red;
1991
1992           tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1993           red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1994           if (red)
1995             {
1996               red->keep_res = phi;
1997               gsi_next (&gsi);
1998               continue;
1999             }
2000         }
2001       gcc_assert (control_name == NULL_TREE
2002                   && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
2003       control_name = res;
2004       remove_phi_node (&gsi, false);
2005     }
2006   gcc_assert (control_name != NULL_TREE);
2007
2008   /* Initialize the control variable to the number of iterations
2009      according to the rhs of the exit condition.  */
2010   gimple_stmt_iterator gsi = gsi_after_labels (ex_bb);
2011   cond_nit = as_a <gcond *> (last_stmt (exit->src));
2012   nit_1 = gimple_cond_rhs (cond_nit);
2013   nit_1 = force_gimple_operand_gsi (&gsi,
2014                                     fold_convert (TREE_TYPE (control_name), nit_1),
2015                                     false, NULL_TREE, false, GSI_SAME_STMT);
2016   stmt = gimple_build_assign (control_name, nit_1);
2017   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2018 }
2019
2020 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
2021    LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
2022    NEW_DATA is the variable that should be initialized from the argument
2023    of LOOP_FN.  N_THREADS is the requested number of threads, which can be 0 if
2024    that number is to be determined later.  */
2025
2026 static void
2027 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
2028                       tree new_data, unsigned n_threads, location_t loc,
2029                       bool oacc_kernels_p)
2030 {
2031   gimple_stmt_iterator gsi;
2032   basic_block for_bb, ex_bb, continue_bb;
2033   tree t, param;
2034   gomp_parallel *omp_par_stmt;
2035   gimple *omp_return_stmt1, *omp_return_stmt2;
2036   gimple *phi;
2037   gcond *cond_stmt;
2038   gomp_for *for_stmt;
2039   gomp_continue *omp_cont_stmt;
2040   tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
2041   edge exit, nexit, guard, end, e;
2042
2043   /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
2044   if (oacc_kernels_p)
2045     {
2046       tree clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
2047       OMP_CLAUSE_NUM_GANGS_EXPR (clause)
2048         = build_int_cst (integer_type_node, n_threads);
2049       oacc_set_fn_attrib (cfun->decl, clause, true, NULL);
2050     }
2051   else
2052     {
2053       basic_block bb = loop_preheader_edge (loop)->src;
2054       basic_block paral_bb = single_pred (bb);
2055       gsi = gsi_last_bb (paral_bb);
2056
2057       gcc_checking_assert (n_threads != 0);
2058       t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
2059       OMP_CLAUSE_NUM_THREADS_EXPR (t)
2060         = build_int_cst (integer_type_node, n_threads);
2061       omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
2062       gimple_set_location (omp_par_stmt, loc);
2063
2064       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
2065
2066       /* Initialize NEW_DATA.  */
2067       if (data)
2068         {
2069           gassign *assign_stmt;
2070
2071           gsi = gsi_after_labels (bb);
2072
2073           param = make_ssa_name (DECL_ARGUMENTS (loop_fn));
2074           assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data));
2075           gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2076
2077           assign_stmt = gimple_build_assign (new_data,
2078                                              fold_convert (TREE_TYPE (new_data), param));
2079           gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2080         }
2081
2082       /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.
*/ 2083 bb = split_loop_exit_edge (single_dom_exit (loop)); 2084 gsi = gsi_last_bb (bb); 2085 omp_return_stmt1 = gimple_build_omp_return (false); 2086 gimple_set_location (omp_return_stmt1, loc); 2087 gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT); 2088 } 2089 2090 /* Extract data for GIMPLE_OMP_FOR. */ 2091 gcc_assert (loop->header == single_dom_exit (loop)->src); 2092 cond_stmt = as_a <gcond *> (last_stmt (loop->header)); 2093 2094 cvar = gimple_cond_lhs (cond_stmt); 2095 cvar_base = SSA_NAME_VAR (cvar); 2096 phi = SSA_NAME_DEF_STMT (cvar); 2097 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); 2098 initvar = copy_ssa_name (cvar); 2099 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)), 2100 initvar); 2101 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); 2102 2103 gsi = gsi_last_nondebug_bb (loop->latch); 2104 gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next)); 2105 gsi_remove (&gsi, true); 2106 2107 /* Prepare cfg. */ 2108 for_bb = split_edge (loop_preheader_edge (loop)); 2109 ex_bb = split_loop_exit_edge (single_dom_exit (loop)); 2110 extract_true_false_edges_from_block (loop->header, &nexit, &exit); 2111 gcc_assert (exit == single_dom_exit (loop)); 2112 2113 guard = make_edge (for_bb, ex_bb, 0); 2114 /* Split the latch edge, so LOOPS_HAVE_SIMPLE_LATCHES is still valid. */ 2115 loop->latch = split_edge (single_succ_edge (loop->latch)); 2116 single_pred_edge (loop->latch)->flags = 0; 2117 end = make_edge (single_pred (loop->latch), ex_bb, EDGE_FALLTHRU); 2118 rescan_loop_exit (end, true, false); 2119 2120 for (gphi_iterator gpi = gsi_start_phis (ex_bb); 2121 !gsi_end_p (gpi); gsi_next (&gpi)) 2122 { 2123 source_location locus; 2124 gphi *phi = gpi.phi (); 2125 tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); 2126 gimple *def_stmt = SSA_NAME_DEF_STMT (def); 2127 2128 /* If the exit phi is not connected to a header phi in the same loop, this 2129 value is not modified in the loop, and we're done with this phi. */ 2130 if (!(gimple_code (def_stmt) == GIMPLE_PHI 2131 && gimple_bb (def_stmt) == loop->header)) 2132 { 2133 locus = gimple_phi_arg_location_from_edge (phi, exit); 2134 add_phi_arg (phi, def, guard, locus); 2135 add_phi_arg (phi, def, end, locus); 2136 continue; 2137 } 2138 2139 gphi *stmt = as_a <gphi *> (def_stmt); 2140 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop)); 2141 locus = gimple_phi_arg_location_from_edge (stmt, 2142 loop_preheader_edge (loop)); 2143 add_phi_arg (phi, def, guard, locus); 2144 2145 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop)); 2146 locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop)); 2147 add_phi_arg (phi, def, end, locus); 2148 } 2149 e = redirect_edge_and_branch (exit, nexit->dest); 2150 PENDING_STMT (e) = NULL; 2151 2152 /* Emit GIMPLE_OMP_FOR. */ 2153 if (oacc_kernels_p) 2154 /* In combination with the NUM_GANGS on the parallel. 
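         That is, the GANG clause built below distributes the iterations of
         the loop over the gangs requested by the NUM_GANGS clause recorded
         in the function attributes earlier in this function, roughly as if
         the user had written (source-level illustration only):

           #pragma acc parallel num_gangs (N)
           #pragma acc loop gang
           for (...) ...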
*/
2155     t = build_omp_clause (loc, OMP_CLAUSE_GANG);
2156   else
2157     {
2158       t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
2159       int chunk_size = PARAM_VALUE (PARAM_PARLOOPS_CHUNK_SIZE);
2160       enum PARAM_PARLOOPS_SCHEDULE_KIND schedule_type
2161         = (enum PARAM_PARLOOPS_SCHEDULE_KIND) PARAM_VALUE (PARAM_PARLOOPS_SCHEDULE);
2162       switch (schedule_type)
2163         {
2164         case PARAM_PARLOOPS_SCHEDULE_KIND_static:
2165           OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
2166           break;
2167         case PARAM_PARLOOPS_SCHEDULE_KIND_dynamic:
2168           OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC;
2169           break;
2170         case PARAM_PARLOOPS_SCHEDULE_KIND_guided:
2171           OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED;
2172           break;
2173         case PARAM_PARLOOPS_SCHEDULE_KIND_auto:
2174           OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO;
2175           chunk_size = 0;
2176           break;
2177         case PARAM_PARLOOPS_SCHEDULE_KIND_runtime:
2178           OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME;
2179           chunk_size = 0;
2180           break;
2181         default:
2182           gcc_unreachable ();
2183         }
2184       if (chunk_size != 0)
2185         OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t)
2186           = build_int_cst (integer_type_node, chunk_size);
2187     }
2188
2189   for_stmt = gimple_build_omp_for (NULL,
2190                                    (oacc_kernels_p
2191                                     ? GF_OMP_FOR_KIND_OACC_LOOP
2192                                     : GF_OMP_FOR_KIND_FOR),
2193                                    t, 1, NULL);
2194
2195   gimple_cond_set_lhs (cond_stmt, cvar_base);
2196   type = TREE_TYPE (cvar);
2197   gimple_set_location (for_stmt, loc);
2198   gimple_omp_for_set_index (for_stmt, 0, initvar);
2199   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
2200   gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
2201   gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
2202   gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
2203                                                 cvar_base,
2204                                                 build_int_cst (type, 1)));
2205
2206   gsi = gsi_last_bb (for_bb);
2207   gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
2208   SSA_NAME_DEF_STMT (initvar) = for_stmt;
2209
2210   /* Emit GIMPLE_OMP_CONTINUE.  */
2211   continue_bb = single_pred (loop->latch);
2212   gsi = gsi_last_bb (continue_bb);
2213   omp_cont_stmt = gimple_build_omp_continue (cvar_next, cvar);
2214   gimple_set_location (omp_cont_stmt, loc);
2215   gsi_insert_after (&gsi, omp_cont_stmt, GSI_NEW_STMT);
2216   SSA_NAME_DEF_STMT (cvar_next) = omp_cont_stmt;
2217
2218   /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
2219   gsi = gsi_last_bb (ex_bb);
2220   omp_return_stmt2 = gimple_build_omp_return (true);
2221   gimple_set_location (omp_return_stmt2, loc);
2222   gsi_insert_after (&gsi, omp_return_stmt2, GSI_NEW_STMT);
2223
2224   /* After the above, dom info is hosed.  Recompute it.  */
2225   free_dominance_info (CDI_DOMINATORS);
2226   calculate_dominance_info (CDI_DOMINATORS);
2227 }
2228
2229 /* Generates code to execute the iterations of LOOP in N_THREADS
2230    threads in parallel; N_THREADS can be 0 if that number is to be
2231    determined later.
2232
2233    NITER describes the number of iterations of LOOP.
2234    REDUCTION_LIST describes the reductions present in LOOP.
*/
2235
2236 static void
2237 gen_parallel_loop (struct loop *loop,
2238                    reduction_info_table_type *reduction_list,
2239                    unsigned n_threads, struct tree_niter_desc *niter,
2240                    bool oacc_kernels_p)
2241 {
2242   tree many_iterations_cond, type, nit;
2243   tree arg_struct, new_arg_struct;
2244   gimple_seq stmts;
2245   edge entry, exit;
2246   struct clsn_data clsn_data;
2247   unsigned prob;
2248   location_t loc;
2249   gimple *cond_stmt;
2250   unsigned int m_p_thread = 2;
2251
2252   /* From
2253
2254      ---------------------------------------------------------------------
2255      loop
2256        {
2257          IV = phi (INIT, IV + STEP)
2258          BODY1;
2259          if (COND)
2260            break;
2261          BODY2;
2262        }
2263      ---------------------------------------------------------------------
2264
2265      with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
2266      we generate the following code:
2267
2268      ---------------------------------------------------------------------
2269
2270      if (MAY_BE_ZERO
2271          || NITER < MIN_PER_THREAD * N_THREADS)
2272        goto original;
2273
2274      BODY1;
2275      store all local loop-invariant variables used in the body of the loop to DATA.
2276      GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
2277      load the variables from DATA.
2278      GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
2279      BODY2;
2280      BODY1;
2281      GIMPLE_OMP_CONTINUE;
2282      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
2283      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
2284      goto end;
2285
2286      original:
2287      loop
2288        {
2289          IV = phi (INIT, IV + STEP)
2290          BODY1;
2291          if (COND)
2292            break;
2293          BODY2;
2294        }
2295
2296      end:
2297
2298   */
2299
2300   /* Create two versions of the loop -- in the old one, we know that the
2301      number of iterations is large enough, and we will transform it into the
2302      loop that will be split off into loop_fn; the new one will be used for
2303      the remaining iterations.  */
2304
2305   /* We should compute a better number-of-iterations value for outer loops.
2306      That is, if we have
2307
2308      for (i = 0; i < n; ++i)
2309        for (j = 0; j < m; ++j)
2310          ...
2311
2312      we should compute nit = n * m, not nit = n.
2313      Also may_be_zero handling would need to be adjusted.  */
2314
2315   type = TREE_TYPE (niter->niter);
2316   nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
2317                               NULL_TREE);
2318   if (stmts)
2319     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2320
2321   if (!oacc_kernels_p)
2322     {
2323       if (loop->inner)
2324         m_p_thread = 2;
2325       else
2326         m_p_thread = MIN_PER_THREAD;
2327
2328       gcc_checking_assert (n_threads != 0);
2329       many_iterations_cond =
2330         fold_build2 (GE_EXPR, boolean_type_node,
2331                      nit, build_int_cst (type, m_p_thread * n_threads));
2332
2333       many_iterations_cond
2334         = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2335                        invert_truthvalue (unshare_expr (niter->may_be_zero)),
2336                        many_iterations_cond);
2337       many_iterations_cond
2338         = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
2339       if (stmts)
2340         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2341       if (!is_gimple_condexpr (many_iterations_cond))
2342         {
2343           many_iterations_cond
2344             = force_gimple_operand (many_iterations_cond, &stmts,
2345                                     true, NULL_TREE);
2346           if (stmts)
2347             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
2348                                               stmts);
2349         }
2350
2351       initialize_original_copy_tables ();
2352
2353       /* We assume that the loop usually iterates a lot.
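         With REG_BR_PROB_BASE being 10000, the probability computed below
         is 8000, i.e. the many-iterations (parallelized) version is
         predicted to be entered about 80% of the time.  This is a static
         heuristic, not a measured profile.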
*/
2354       prob = 4 * REG_BR_PROB_BASE / 5;
2355       loop_version (loop, many_iterations_cond, NULL,
2356                     prob, REG_BR_PROB_BASE - prob,
2357                     prob, REG_BR_PROB_BASE - prob, true);
2358       update_ssa (TODO_update_ssa);
2359       free_original_copy_tables ();
2360     }
2361
2362   /* Base all the induction variables in LOOP on a single control one.  */
2363   canonicalize_loop_ivs (loop, &nit, true);
2364
2365   /* Ensure that the exit condition is the first statement in the loop.
2366      The common case is that the latch of the loop is empty (apart from the
2367      increment) and immediately follows the loop exit test.  Attempt to move
2368      the entry of the loop directly before the exit check and increase the
2369      number of iterations of the loop by one.  */
2370   if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
2371     {
2372       if (dump_file
2373           && (dump_flags & TDF_DETAILS))
2374         fprintf (dump_file,
2375                  "alternative exit-first loop transform succeeded"
2376                  " for loop %d\n", loop->num);
2377     }
2378   else
2379     {
2380       if (oacc_kernels_p)
2381         n_threads = 1;
2382
2383       /* Fall back on the method that handles more cases, but duplicates the
2384          loop body: move the exit condition of LOOP to the beginning of its
2385          header, and duplicate the part of the last iteration that gets disabled
2386          to the exit of the loop.  */
2387       transform_to_exit_first_loop (loop, reduction_list, nit);
2388     }
2389
2390   /* Generate initializations for reductions.  */
2391   if (reduction_list->elements () > 0)
2392     reduction_list->traverse <struct loop *, initialize_reductions> (loop);
2393
2394   /* Eliminate the references to local variables from the loop.  */
2395   gcc_assert (single_exit (loop));
2396   entry = loop_preheader_edge (loop);
2397   exit = single_dom_exit (loop);
2398
2399   /* This rewrites the body in terms of new variables.  This has already
2400      been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
2401   if (!oacc_kernels_p)
2402     {
2403       eliminate_local_variables (entry, exit);
2404       /* In the old loop, move all variables non-local to the loop to a
2405          structure and back, and create separate decls for the variables used
2406          in the loop.  */
2407       separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
2408                                 &new_arg_struct, &clsn_data);
2409     }
2410   else
2411     {
2412       arg_struct = NULL_TREE;
2413       new_arg_struct = NULL_TREE;
2414       clsn_data.load = NULL_TREE;
2415       clsn_data.load_bb = exit->dest;
2416       clsn_data.store = NULL_TREE;
2417       clsn_data.store_bb = NULL;
2418     }
2419
2420   /* Create the parallel constructs.  */
2421   loc = UNKNOWN_LOCATION;
2422   cond_stmt = last_stmt (loop->header);
2423   if (cond_stmt)
2424     loc = gimple_location (cond_stmt);
2425   create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct,
2426                         n_threads, loc, oacc_kernels_p);
2427   if (reduction_list->elements () > 0)
2428     create_call_for_reduction (loop, reduction_list, &clsn_data);
2429
2430   scev_reset ();
2431
2432   /* Free loop bound estimations that could contain references to
2433      removed statements.  */
2434   FOR_EACH_LOOP (loop, 0)
2435     free_numbers_of_iterations_estimates_loop (loop);
2436 }
2437
2438 /* Returns true when LOOP contains vector phi nodes.
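   For instance (made-up dump), a phi left over from vectorization such as

     # vect_sum.7_38 = PHI <vect_sum.7_35 (4), { 0, 0, 0, 0 } (2)>

   has a result of vector type; parallelize_loops gives up on loops
   containing such phis.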
*/
2440 static bool
2441 loop_has_vector_phi_nodes (struct loop *loop)
2442 {
2443   unsigned i;
2444   basic_block *bbs = get_loop_body_in_dom_order (loop);
2445   gphi_iterator gsi;
2446   bool res = true;
2447
2448   for (i = 0; i < loop->num_nodes; i++)
2449     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2450       if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi.phi ()))) == VECTOR_TYPE)
2451         goto end;
2452
2453   res = false;
2454  end:
2455   free (bbs);
2456   return res;
2457 }
2458
2459 /* Create a reduction_info struct, initialize it with REDUC_STMT
2460    and PHI, insert it into REDUCTION_LIST.  */
2461
2462 static void
2463 build_new_reduction (reduction_info_table_type *reduction_list,
2464                      gimple *reduc_stmt, gphi *phi)
2465 {
2466   reduction_info **slot;
2467   struct reduction_info *new_reduction;
2468   enum tree_code reduction_code;
2469
2470   gcc_assert (reduc_stmt);
2471
2472   if (dump_file && (dump_flags & TDF_DETAILS))
2473     {
2474       fprintf (dump_file,
2475                "Detected reduction. reduction stmt is:\n");
2476       print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
2477       fprintf (dump_file, "\n");
2478     }
2479
2480   if (gimple_code (reduc_stmt) == GIMPLE_PHI)
2481     {
2482       tree op1 = PHI_ARG_DEF (reduc_stmt, 0);
2483       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2484       reduction_code = gimple_assign_rhs_code (def1);
2485     }
2486
2487   else
2488     reduction_code = gimple_assign_rhs_code (reduc_stmt);
2489
2490   new_reduction = XCNEW (struct reduction_info);
2491
2492   new_reduction->reduc_stmt = reduc_stmt;
2493   new_reduction->reduc_phi = phi;
2494   new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
2495   new_reduction->reduction_code = reduction_code;
2496   slot = reduction_list->find_slot (new_reduction, INSERT);
2497   *slot = new_reduction;
2498 }
2499
2500 /* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */
2501
2502 int
2503 set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
2504 {
2505   struct reduction_info *const red = *slot;
2506   gimple_set_uid (red->reduc_phi, red->reduc_version);
2507   return 1;
2508 }
2509
2510 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.
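   For a simple accumulation (made-up SSA names), the detected pattern is
   the loop header phi together with the statement updating the accumulator:

     # s_1 = PHI <0 (preheader), s_2 (latch)>
     ...
     s_2 = s_1 + x_3;

   For double reductions, the analysis runs on the inner loop phi, but the
   reduction is registered for the outer loop phi.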
*/
2512 static void
2513 gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list)
2514 {
2515   gphi_iterator gsi;
2516   loop_vec_info simple_loop_info;
2517   auto_vec<gphi *, 4> double_reduc_phis;
2518   auto_vec<gimple *, 4> double_reduc_stmts;
2519
2520   if (!stmt_vec_info_vec.exists ())
2521     init_stmt_vec_info_vec ();
2522
2523   simple_loop_info = vect_analyze_loop_form (loop);
2524   if (simple_loop_info == NULL)
2525     goto gather_done;
2526
2527   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2528     {
2529       gphi *phi = gsi.phi ();
2530       affine_iv iv;
2531       tree res = PHI_RESULT (phi);
2532       bool double_reduc;
2533
2534       if (virtual_operand_p (res))
2535         continue;
2536
2537       if (simple_iv (loop, loop, res, &iv, true))
2538         continue;
2539
2540       gimple *reduc_stmt
2541         = vect_force_simple_reduction (simple_loop_info, phi, true,
2542                                        &double_reduc, true);
2543       if (!reduc_stmt)
2544         continue;
2545
2546       if (double_reduc)
2547         {
2548           if (loop->inner->inner != NULL)
2549             continue;
2550
2551           double_reduc_phis.safe_push (phi);
2552           double_reduc_stmts.safe_push (reduc_stmt);
2553           continue;
2554         }
2555
2556       build_new_reduction (reduction_list, reduc_stmt, phi);
2557     }
2558   destroy_loop_vec_info (simple_loop_info, true);
2559
2560   if (!double_reduc_phis.is_empty ())
2561     {
2562       simple_loop_info = vect_analyze_loop_form (loop->inner);
2563       if (simple_loop_info)
2564         {
2565           gphi *phi;
2566           unsigned int i;
2567
2568           FOR_EACH_VEC_ELT (double_reduc_phis, i, phi)
2569             {
2570               affine_iv iv;
2571               tree res = PHI_RESULT (phi);
2572               bool double_reduc;
2573
2574               use_operand_p use_p;
2575               gimple *inner_stmt;
2576               bool single_use_p = single_imm_use (res, &use_p, &inner_stmt);
2577               gcc_assert (single_use_p);
2578               if (gimple_code (inner_stmt) != GIMPLE_PHI)
2579                 continue;
2580               gphi *inner_phi = as_a <gphi *> (inner_stmt);
2581               if (simple_iv (loop->inner, loop->inner, PHI_RESULT (inner_phi),
2582                              &iv, true))
2583                 continue;
2584
2585               gimple *inner_reduc_stmt
2586                 = vect_force_simple_reduction (simple_loop_info, inner_phi,
2587                                                true, &double_reduc, true);
2588               gcc_assert (!double_reduc);
2589               if (inner_reduc_stmt == NULL)
2590                 continue;
2591
2592               build_new_reduction (reduction_list, double_reduc_stmts[i], phi);
2593             }
2594           destroy_loop_vec_info (simple_loop_info, true);
2595         }
2596     }
2597
2598  gather_done:
2599   /* Release the claim on gimple_uid.  */
2600   free_stmt_vec_info_vec ();
2601
2602   if (reduction_list->elements () == 0)
2603     return;
2604
2605   /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
2606      and free_stmt_vec_info_vec, we can set gimple_uid of reduc_phi stmts only
2607      now.  */
2608   basic_block bb;
2609   FOR_EACH_BB_FN (bb, cfun)
2610     for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2611       gimple_set_uid (gsi_stmt (gsi), (unsigned int)-1);
2612   reduction_list->traverse <void *, set_reduc_phi_uids> (NULL);
2613 }
2614
2615 /* Try to initialize NITER for the code generation part.  */
2616
2617 static bool
2618 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
2619 {
2620   edge exit = single_dom_exit (loop);
2621
2622   gcc_assert (exit);
2623
2624   /* We need to know # of iterations, and there should be no uses of values
2625      defined inside the loop outside of it, unless the values are invariants
2626      of the loop.
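     For instance (illustrative), in

       for (i = 0; i < n; i++)
         last = a[i];
       use (last);

     the value of last is used after the loop; such a use is only acceptable
     if it can be removed by final value replacement or recognized as part
     of a reduction (see try_create_reduction_list).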
*/ 2627 if (!number_of_iterations_exit (loop, exit, niter, false)) 2628 { 2629 if (dump_file && (dump_flags & TDF_DETAILS)) 2630 fprintf (dump_file, " FAILED: number of iterations not known\n"); 2631 return false; 2632 } 2633 2634 return true; 2635 } 2636 2637 /* Return the default def of the first function argument. */ 2638 2639 static tree 2640 get_omp_data_i_param (void) 2641 { 2642 tree decl = DECL_ARGUMENTS (cfun->decl); 2643 gcc_assert (DECL_CHAIN (decl) == NULL_TREE); 2644 return ssa_default_def (cfun, decl); 2645 } 2646 2647 /* For PHI in loop header of LOOP, look for pattern: 2648 2649 <bb preheader> 2650 .omp_data_i = &.omp_data_arr; 2651 addr = .omp_data_i->sum; 2652 sum_a = *addr; 2653 2654 <bb header>: 2655 sum_b = PHI <sum_a (preheader), sum_c (latch)> 2656 2657 and return addr. Otherwise, return NULL_TREE. */ 2658 2659 static tree 2660 find_reduc_addr (struct loop *loop, gphi *phi) 2661 { 2662 edge e = loop_preheader_edge (loop); 2663 tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e); 2664 gimple *stmt = SSA_NAME_DEF_STMT (arg); 2665 if (!gimple_assign_single_p (stmt)) 2666 return NULL_TREE; 2667 tree memref = gimple_assign_rhs1 (stmt); 2668 if (TREE_CODE (memref) != MEM_REF) 2669 return NULL_TREE; 2670 tree addr = TREE_OPERAND (memref, 0); 2671 2672 gimple *stmt2 = SSA_NAME_DEF_STMT (addr); 2673 if (!gimple_assign_single_p (stmt2)) 2674 return NULL_TREE; 2675 tree compref = gimple_assign_rhs1 (stmt2); 2676 if (TREE_CODE (compref) != COMPONENT_REF) 2677 return NULL_TREE; 2678 tree addr2 = TREE_OPERAND (compref, 0); 2679 if (TREE_CODE (addr2) != MEM_REF) 2680 return NULL_TREE; 2681 addr2 = TREE_OPERAND (addr2, 0); 2682 if (TREE_CODE (addr2) != SSA_NAME 2683 || addr2 != get_omp_data_i_param ()) 2684 return NULL_TREE; 2685 2686 return addr; 2687 } 2688 2689 /* Try to initialize REDUCTION_LIST for code generation part. 2690 REDUCTION_LIST describes the reductions. */ 2691 2692 static bool 2693 try_create_reduction_list (loop_p loop, 2694 reduction_info_table_type *reduction_list, 2695 bool oacc_kernels_p) 2696 { 2697 edge exit = single_dom_exit (loop); 2698 gphi_iterator gsi; 2699 2700 gcc_assert (exit); 2701 2702 /* Try to get rid of exit phis. 
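     For example (illustrative SSA names), an exit phi for the control iv
     like

       # i_27 = PHI <i_11(4)>

     where the final value of i is computable as N is replaced by a direct
     computation from N, so the use no longer looks like a loop-carried
     value to the checks below.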
*/ 2703 final_value_replacement_loop (loop); 2704 2705 gather_scalar_reductions (loop, reduction_list); 2706 2707 2708 for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi)) 2709 { 2710 gphi *phi = gsi.phi (); 2711 struct reduction_info *red; 2712 imm_use_iterator imm_iter; 2713 use_operand_p use_p; 2714 gimple *reduc_phi; 2715 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit); 2716 2717 if (!virtual_operand_p (val)) 2718 { 2719 if (dump_file && (dump_flags & TDF_DETAILS)) 2720 { 2721 fprintf (dump_file, "phi is "); 2722 print_gimple_stmt (dump_file, phi, 0, 0); 2723 fprintf (dump_file, "arg of phi to exit: value "); 2724 print_generic_expr (dump_file, val, 0); 2725 fprintf (dump_file, " used outside loop\n"); 2726 fprintf (dump_file, 2727 " checking if it is part of reduction pattern:\n"); 2728 } 2729 if (reduction_list->elements () == 0) 2730 { 2731 if (dump_file && (dump_flags & TDF_DETAILS)) 2732 fprintf (dump_file, 2733 " FAILED: it is not a part of reduction.\n"); 2734 return false; 2735 } 2736 reduc_phi = NULL; 2737 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val) 2738 { 2739 if (!gimple_debug_bind_p (USE_STMT (use_p)) 2740 && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 2741 { 2742 reduc_phi = USE_STMT (use_p); 2743 break; 2744 } 2745 } 2746 red = reduction_phi (reduction_list, reduc_phi); 2747 if (red == NULL) 2748 { 2749 if (dump_file && (dump_flags & TDF_DETAILS)) 2750 fprintf (dump_file, 2751 " FAILED: it is not a part of reduction.\n"); 2752 return false; 2753 } 2754 if (red->keep_res != NULL) 2755 { 2756 if (dump_file && (dump_flags & TDF_DETAILS)) 2757 fprintf (dump_file, 2758 " FAILED: reduction has multiple exit phis.\n"); 2759 return false; 2760 } 2761 red->keep_res = phi; 2762 if (dump_file && (dump_flags & TDF_DETAILS)) 2763 { 2764 fprintf (dump_file, "reduction phi is "); 2765 print_gimple_stmt (dump_file, red->reduc_phi, 0, 0); 2766 fprintf (dump_file, "reduction stmt is "); 2767 print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0); 2768 } 2769 } 2770 } 2771 2772 /* The iterations of the loop may communicate only through bivs whose 2773 iteration space can be distributed efficiently. */ 2774 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) 2775 { 2776 gphi *phi = gsi.phi (); 2777 tree def = PHI_RESULT (phi); 2778 affine_iv iv; 2779 2780 if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true)) 2781 { 2782 struct reduction_info *red; 2783 2784 red = reduction_phi (reduction_list, phi); 2785 if (red == NULL) 2786 { 2787 if (dump_file && (dump_flags & TDF_DETAILS)) 2788 fprintf (dump_file, 2789 " FAILED: scalar dependency between iterations\n"); 2790 return false; 2791 } 2792 } 2793 } 2794 2795 if (oacc_kernels_p) 2796 { 2797 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); 2798 gsi_next (&gsi)) 2799 { 2800 gphi *phi = gsi.phi (); 2801 tree def = PHI_RESULT (phi); 2802 affine_iv iv; 2803 2804 if (!virtual_operand_p (def) 2805 && !simple_iv (loop, loop, def, &iv, true)) 2806 { 2807 tree addr = find_reduc_addr (loop, phi); 2808 if (addr == NULL_TREE) 2809 return false; 2810 struct reduction_info *red = reduction_phi (reduction_list, phi); 2811 red->reduc_addr = addr; 2812 } 2813 } 2814 } 2815 2816 return true; 2817 } 2818 2819 /* Return true if LOOP contains phis with ADDR_EXPR in args. 
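   E.g. (made-up names) a phi such as

     # p_5 = PHI <&a(3), &b(4)>

   takes the address of a local variable; eliminate_local_variables does not
   rewrite such arguments, so loops containing them are rejected.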
*/
2821 static bool
2822 loop_has_phi_with_address_arg (struct loop *loop)
2823 {
2824   basic_block *bbs = get_loop_body (loop);
2825   bool res = false;
2826
2827   unsigned i, j;
2828   gphi_iterator gsi;
2829   for (i = 0; i < loop->num_nodes; i++)
2830     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2831       {
2832         gphi *phi = gsi.phi ();
2833         for (j = 0; j < gimple_phi_num_args (phi); j++)
2834           {
2835             tree arg = gimple_phi_arg_def (phi, j);
2836             if (TREE_CODE (arg) == ADDR_EXPR)
2837               {
2838                 /* This should be handled by eliminate_local_variables, but that
2839                    function currently ignores phis.  */
2840                 res = true;
2841                 goto end;
2842               }
2843           }
2844       }
2845  end:
2846   free (bbs);
2847
2848   return res;
2849 }
2850
2851 /* Return true if memory ref REF (corresponding to the stmt at GSI in
2852    REGION_BBS[I]) conflicts with the statements in REGION_BBS[I] after GSI,
2853    or with the statements in REGION_BBS[I + n].  REF_IS_STORE indicates if
2854    REF is a store.  Ignore conflicts with SKIP_STMT.  */
2855
2856 static bool
2857 ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref,
2858                            bool ref_is_store, vec<basic_block> region_bbs,
2859                            unsigned int i, gimple *skip_stmt)
2860 {
2861   basic_block bb = region_bbs[i];
2862   gsi_next (&gsi);
2863
2864   while (true)
2865     {
2866       for (; !gsi_end_p (gsi);
2867            gsi_next (&gsi))
2868         {
2869           gimple *stmt = gsi_stmt (gsi);
2870           if (stmt == skip_stmt)
2871             {
2872               if (dump_file)
2873                 {
2874                   fprintf (dump_file, "skipping reduction store: ");
2875                   print_gimple_stmt (dump_file, stmt, 0, 0);
2876                 }
2877               continue;
2878             }
2879
2880           if (!gimple_vdef (stmt)
2881               && !gimple_vuse (stmt))
2882             continue;
2883
2884           if (gimple_code (stmt) == GIMPLE_RETURN)
2885             continue;
2886
2887           if (ref_is_store)
2888             {
2889               if (ref_maybe_used_by_stmt_p (stmt, ref))
2890                 {
2891                   if (dump_file)
2892                     {
2893                       fprintf (dump_file, "Stmt ");
2894                       print_gimple_stmt (dump_file, stmt, 0, 0);
2895                     }
2896                   return true;
2897                 }
2898             }
2899           else
2900             {
2901               if (stmt_may_clobber_ref_p_1 (stmt, ref))
2902                 {
2903                   if (dump_file)
2904                     {
2905                       fprintf (dump_file, "Stmt ");
2906                       print_gimple_stmt (dump_file, stmt, 0, 0);
2907                     }
2908                   return true;
2909                 }
2910             }
2911         }
2912       i++;
2913       if (i == region_bbs.length ())
2914         break;
2915       bb = region_bbs[i];
2916       gsi = gsi_start_bb (bb);
2917     }
2918
2919   return false;
2920 }
2921
2922 /* Return true if the bbs in REGION_BBS but not in IN_LOOP_BBS can be executed
2923    in parallel with the loop contained in IN_LOOP_BBS.  Return the stores of
2924    reduction results in REDUCTION_STORES.
*/ 2925 2926 static bool 2927 oacc_entry_exit_ok_1 (bitmap in_loop_bbs, vec<basic_block> region_bbs, 2928 reduction_info_table_type *reduction_list, 2929 bitmap reduction_stores) 2930 { 2931 tree omp_data_i = get_omp_data_i_param (); 2932 2933 unsigned i; 2934 basic_block bb; 2935 FOR_EACH_VEC_ELT (region_bbs, i, bb) 2936 { 2937 if (bitmap_bit_p (in_loop_bbs, bb->index)) 2938 continue; 2939 2940 gimple_stmt_iterator gsi; 2941 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 2942 gsi_next (&gsi)) 2943 { 2944 gimple *stmt = gsi_stmt (gsi); 2945 gimple *skip_stmt = NULL; 2946 2947 if (is_gimple_debug (stmt) 2948 || gimple_code (stmt) == GIMPLE_COND) 2949 continue; 2950 2951 ao_ref ref; 2952 bool ref_is_store = false; 2953 if (gimple_assign_load_p (stmt)) 2954 { 2955 tree rhs = gimple_assign_rhs1 (stmt); 2956 tree base = get_base_address (rhs); 2957 if (TREE_CODE (base) == MEM_REF 2958 && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0)) 2959 continue; 2960 2961 tree lhs = gimple_assign_lhs (stmt); 2962 if (TREE_CODE (lhs) == SSA_NAME 2963 && has_single_use (lhs)) 2964 { 2965 use_operand_p use_p; 2966 gimple *use_stmt; 2967 single_imm_use (lhs, &use_p, &use_stmt); 2968 if (gimple_code (use_stmt) == GIMPLE_PHI) 2969 { 2970 struct reduction_info *red; 2971 red = reduction_phi (reduction_list, use_stmt); 2972 tree val = PHI_RESULT (red->keep_res); 2973 if (has_single_use (val)) 2974 { 2975 single_imm_use (val, &use_p, &use_stmt); 2976 if (gimple_store_p (use_stmt)) 2977 { 2978 unsigned int id 2979 = SSA_NAME_VERSION (gimple_vdef (use_stmt)); 2980 bitmap_set_bit (reduction_stores, id); 2981 skip_stmt = use_stmt; 2982 if (dump_file) 2983 { 2984 fprintf (dump_file, "found reduction load: "); 2985 print_gimple_stmt (dump_file, stmt, 0, 0); 2986 } 2987 } 2988 } 2989 } 2990 } 2991 2992 ao_ref_init (&ref, rhs); 2993 } 2994 else if (gimple_store_p (stmt)) 2995 { 2996 ao_ref_init (&ref, gimple_assign_lhs (stmt)); 2997 ref_is_store = true; 2998 } 2999 else if (gimple_code (stmt) == GIMPLE_OMP_RETURN) 3000 continue; 3001 else if (!gimple_has_side_effects (stmt) 3002 && !gimple_could_trap_p (stmt) 3003 && !stmt_could_throw_p (stmt) 3004 && !gimple_vdef (stmt) 3005 && !gimple_vuse (stmt)) 3006 continue; 3007 else if (gimple_call_internal_p (stmt, IFN_GOACC_DIM_POS)) 3008 continue; 3009 else if (gimple_code (stmt) == GIMPLE_RETURN) 3010 continue; 3011 else 3012 { 3013 if (dump_file) 3014 { 3015 fprintf (dump_file, "Unhandled stmt in entry/exit: "); 3016 print_gimple_stmt (dump_file, stmt, 0, 0); 3017 } 3018 return false; 3019 } 3020 3021 if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs, 3022 i, skip_stmt)) 3023 { 3024 if (dump_file) 3025 { 3026 fprintf (dump_file, "conflicts with entry/exit stmt: "); 3027 print_gimple_stmt (dump_file, stmt, 0, 0); 3028 } 3029 return false; 3030 } 3031 } 3032 } 3033 3034 return true; 3035 } 3036 3037 /* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with 3038 gang_pos == 0, except when the stores are REDUCTION_STORES. Return true 3039 if any changes were made. 
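   As a schematic illustration (made-up GIMPLE, names invented), a store
   a = sum_5; in the entry/exit code is rewritten to be executed by gang 0
   only:

     gang_pos_1 = .GOACC_DIM_POS (GOMP_DIM_GANG);
     ...
     if (gang_pos_1 == 0)
       goto <store bb>;
     else
       goto <join bb>;

     <store bb>:
     a = sum_5;

     <join bb>:

   with a virtual phi in the join bb merging the memory state from both
   paths.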
*/ 3040 3041 static bool 3042 oacc_entry_exit_single_gang (bitmap in_loop_bbs, vec<basic_block> region_bbs, 3043 bitmap reduction_stores) 3044 { 3045 tree gang_pos = NULL_TREE; 3046 bool changed = false; 3047 3048 unsigned i; 3049 basic_block bb; 3050 FOR_EACH_VEC_ELT (region_bbs, i, bb) 3051 { 3052 if (bitmap_bit_p (in_loop_bbs, bb->index)) 3053 continue; 3054 3055 gimple_stmt_iterator gsi; 3056 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);) 3057 { 3058 gimple *stmt = gsi_stmt (gsi); 3059 3060 if (!gimple_store_p (stmt)) 3061 { 3062 /* Update gsi to point to next stmt. */ 3063 gsi_next (&gsi); 3064 continue; 3065 } 3066 3067 if (bitmap_bit_p (reduction_stores, 3068 SSA_NAME_VERSION (gimple_vdef (stmt)))) 3069 { 3070 if (dump_file) 3071 { 3072 fprintf (dump_file, 3073 "skipped reduction store for single-gang" 3074 " neutering: "); 3075 print_gimple_stmt (dump_file, stmt, 0, 0); 3076 } 3077 3078 /* Update gsi to point to next stmt. */ 3079 gsi_next (&gsi); 3080 continue; 3081 } 3082 3083 changed = true; 3084 3085 if (gang_pos == NULL_TREE) 3086 { 3087 tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG); 3088 gcall *gang_single 3089 = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg); 3090 gang_pos = make_ssa_name (integer_type_node); 3091 gimple_call_set_lhs (gang_single, gang_pos); 3092 gimple_stmt_iterator start 3093 = gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun))); 3094 tree vuse = ssa_default_def (cfun, gimple_vop (cfun)); 3095 gimple_set_vuse (gang_single, vuse); 3096 gsi_insert_before (&start, gang_single, GSI_SAME_STMT); 3097 } 3098 3099 if (dump_file) 3100 { 3101 fprintf (dump_file, 3102 "found store that needs single-gang neutering: "); 3103 print_gimple_stmt (dump_file, stmt, 0, 0); 3104 } 3105 3106 { 3107 /* Split block before store. */ 3108 gimple_stmt_iterator gsi2 = gsi; 3109 gsi_prev (&gsi2); 3110 edge e; 3111 if (gsi_end_p (gsi2)) 3112 { 3113 e = split_block_after_labels (bb); 3114 gsi2 = gsi_last_bb (bb); 3115 } 3116 else 3117 e = split_block (bb, gsi_stmt (gsi2)); 3118 basic_block bb2 = e->dest; 3119 3120 /* Split block after store. */ 3121 gimple_stmt_iterator gsi3 = gsi_start_bb (bb2); 3122 edge e2 = split_block (bb2, gsi_stmt (gsi3)); 3123 basic_block bb3 = e2->dest; 3124 3125 gimple *cond 3126 = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node, 3127 NULL_TREE, NULL_TREE); 3128 gsi_insert_after (&gsi2, cond, GSI_NEW_STMT); 3129 3130 edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE); 3131 e->flags = EDGE_TRUE_VALUE; 3132 3133 tree vdef = gimple_vdef (stmt); 3134 tree vuse = gimple_vuse (stmt); 3135 3136 tree phi_res = copy_ssa_name (vdef); 3137 gphi *new_phi = create_phi_node (phi_res, bb3); 3138 replace_uses_by (vdef, phi_res); 3139 add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION); 3140 add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION); 3141 3142 /* Update gsi to point to next stmt. */ 3143 bb = bb3; 3144 gsi = gsi_start_bb (bb); 3145 } 3146 } 3147 } 3148 3149 return changed; 3150 } 3151 3152 /* Return true if the statements before and after the LOOP can be executed in 3153 parallel with the function containing the loop. Resolve conflicting stores 3154 outside LOOP by guarding them such that only a single gang executes them. 
*/
3156 static bool
3157 oacc_entry_exit_ok (struct loop *loop,
3158                     reduction_info_table_type *reduction_list)
3159 {
3160   basic_block *loop_bbs = get_loop_body_in_dom_order (loop);
3161   vec<basic_block> region_bbs
3162     = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun));
3163
3164   bitmap in_loop_bbs = BITMAP_ALLOC (NULL);
3165   bitmap_clear (in_loop_bbs);
3166   for (unsigned int i = 0; i < loop->num_nodes; i++)
3167     bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index);
3168
3169   bitmap reduction_stores = BITMAP_ALLOC (NULL);
3170   bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list,
3171                                    reduction_stores);
3172
3173   if (res)
3174     {
3175       bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs,
3176                                                   reduction_stores);
3177       if (changed)
3178         {
3179           free_dominance_info (CDI_DOMINATORS);
3180           calculate_dominance_info (CDI_DOMINATORS);
3181         }
3182     }
3183
3184   region_bbs.release ();
3185   free (loop_bbs);
3186
3187   BITMAP_FREE (in_loop_bbs);
3188   BITMAP_FREE (reduction_stores);
3189
3190   return res;
3191 }
3192
3193 /* Detect parallel loops and generate parallel code using libgomp
3194    primitives.  Returns true if some loop was parallelized, false
3195    otherwise.  */
3196
3197 static bool
3198 parallelize_loops (bool oacc_kernels_p)
3199 {
3200   unsigned n_threads;
3201   bool changed = false;
3202   struct loop *loop;
3203   struct loop *skip_loop = NULL;
3204   struct tree_niter_desc niter_desc;
3205   struct obstack parloop_obstack;
3206   HOST_WIDE_INT estimated;
3207   source_location loop_loc;
3208
3209   /* Do not parallelize loops in the functions created by parallelization.  */
3210   if (!oacc_kernels_p
3211       && parallelized_function_p (cfun->decl))
3212     return false;
3213
3214   /* Do not parallelize loops in offloaded functions.  */
3215   if (!oacc_kernels_p
3216       && oacc_get_fn_attrib (cfun->decl) != NULL)
3217     return false;
3218
3219   if (cfun->has_nonlocal_label)
3220     return false;
3221
3222   /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
3223      the argument to -ftree-parallelize-loops.  */
3224   if (oacc_kernels_p)
3225     n_threads = 0;
3226   else
3227     n_threads = flag_tree_parallelize_loops;
3228
3229   gcc_obstack_init (&parloop_obstack);
3230   reduction_info_table_type reduction_list (10);
3231
3232   calculate_dominance_info (CDI_DOMINATORS);
3233
3234   FOR_EACH_LOOP (loop, 0)
3235     {
3236       if (loop == skip_loop)
3237         {
3238           if (!loop->in_oacc_kernels_region
3239               && dump_file && (dump_flags & TDF_DETAILS))
3240             fprintf (dump_file,
3241                      "Skipping loop %d as inner loop of parallelized loop\n",
3242                      loop->num);
3243
3244           skip_loop = loop->inner;
3245           continue;
3246         }
3247       else
3248         skip_loop = NULL;
3249
3250       reduction_list.empty ();
3251
3252       if (oacc_kernels_p)
3253         {
3254           if (!loop->in_oacc_kernels_region)
3255             continue;
3256
3257           /* Don't try to parallelize inner loops in an oacc kernels region.  */
3258           if (loop->inner)
3259             skip_loop = loop->inner;
3260
3261           if (dump_file && (dump_flags & TDF_DETAILS))
3262             fprintf (dump_file,
3263                      "Trying loop %d with header bb %d in oacc kernels"
3264                      " region\n", loop->num, loop->header->index);
3265         }
3266
3267       if (dump_file && (dump_flags & TDF_DETAILS))
3268         {
3269           fprintf (dump_file, "Trying loop %d as candidate\n", loop->num);
3270           if (loop->inner)
3271             fprintf (dump_file, "loop %d is not innermost\n", loop->num);
3272           else
3273             fprintf (dump_file, "loop %d is innermost\n", loop->num);
3274         }
3275
3276       /* If we use autopar in the graphite pass, we use its marked dependency
3277          checking results.
*/ 3278 if (flag_loop_parallelize_all && !loop->can_be_parallel) 3279 { 3280 if (dump_file && (dump_flags & TDF_DETAILS)) 3281 fprintf (dump_file, "loop is not parallel according to graphite\n"); 3282 continue; 3283 } 3284 3285 if (!single_dom_exit (loop)) 3286 { 3287 3288 if (dump_file && (dump_flags & TDF_DETAILS)) 3289 fprintf (dump_file, "loop is !single_dom_exit\n"); 3290 3291 continue; 3292 } 3293 3294 if (/* And of course, the loop must be parallelizable. */ 3295 !can_duplicate_loop_p (loop) 3296 || loop_has_blocks_with_irreducible_flag (loop) 3297 || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP) 3298 /* FIXME: the check for vector phi nodes could be removed. */ 3299 || loop_has_vector_phi_nodes (loop)) 3300 continue; 3301 3302 estimated = estimated_stmt_executions_int (loop); 3303 if (estimated == -1) 3304 estimated = likely_max_stmt_executions_int (loop); 3305 /* FIXME: Bypass this check as graphite doesn't update the 3306 count and frequency correctly now. */ 3307 if (!flag_loop_parallelize_all 3308 && !oacc_kernels_p 3309 && ((estimated != -1 3310 && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD) 3311 /* Do not bother with loops in cold areas. */ 3312 || optimize_loop_nest_for_size_p (loop))) 3313 continue; 3314 3315 if (!try_get_loop_niter (loop, &niter_desc)) 3316 continue; 3317 3318 if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p)) 3319 continue; 3320 3321 if (loop_has_phi_with_address_arg (loop)) 3322 continue; 3323 3324 if (!flag_loop_parallelize_all 3325 && !loop_parallel_p (loop, &parloop_obstack)) 3326 continue; 3327 3328 if (oacc_kernels_p 3329 && !oacc_entry_exit_ok (loop, &reduction_list)) 3330 { 3331 if (dump_file) 3332 fprintf (dump_file, "entry/exit not ok: FAILED\n"); 3333 continue; 3334 } 3335 3336 changed = true; 3337 skip_loop = loop->inner; 3338 3339 loop_loc = find_loop_location (loop); 3340 if (loop->inner) 3341 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc, 3342 "parallelizing outer loop %d\n", loop->num); 3343 else 3344 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc, 3345 "parallelizing inner loop %d\n", loop->num); 3346 3347 gen_parallel_loop (loop, &reduction_list, 3348 n_threads, &niter_desc, oacc_kernels_p); 3349 } 3350 3351 obstack_free (&parloop_obstack, NULL); 3352 3353 /* Parallelization will cause new function calls to be inserted through 3354 which local variables will escape. Reset the points-to solution 3355 for ESCAPED. */ 3356 if (changed) 3357 pt_solution_reset (&cfun->gimple_df->escaped); 3358 3359 return changed; 3360 } 3361 3362 /* Parallelization. 
*/ 3363 3364 namespace { 3365 3366 const pass_data pass_data_parallelize_loops = 3367 { 3368 GIMPLE_PASS, /* type */ 3369 "parloops", /* name */ 3370 OPTGROUP_LOOP, /* optinfo_flags */ 3371 TV_TREE_PARALLELIZE_LOOPS, /* tv_id */ 3372 ( PROP_cfg | PROP_ssa ), /* properties_required */ 3373 0, /* properties_provided */ 3374 0, /* properties_destroyed */ 3375 0, /* todo_flags_start */ 3376 0, /* todo_flags_finish */ 3377 }; 3378 3379 class pass_parallelize_loops : public gimple_opt_pass 3380 { 3381 public: 3382 pass_parallelize_loops (gcc::context *ctxt) 3383 : gimple_opt_pass (pass_data_parallelize_loops, ctxt), 3384 oacc_kernels_p (false) 3385 {} 3386 3387 /* opt_pass methods: */ 3388 virtual bool gate (function *) 3389 { 3390 if (oacc_kernels_p) 3391 return flag_openacc; 3392 else 3393 return flag_tree_parallelize_loops > 1; 3394 } 3395 virtual unsigned int execute (function *); 3396 opt_pass * clone () { return new pass_parallelize_loops (m_ctxt); } 3397 void set_pass_param (unsigned int n, bool param) 3398 { 3399 gcc_assert (n == 0); 3400 oacc_kernels_p = param; 3401 } 3402 3403 private: 3404 bool oacc_kernels_p; 3405 }; // class pass_parallelize_loops 3406 3407 unsigned 3408 pass_parallelize_loops::execute (function *fun) 3409 { 3410 tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS); 3411 if (nthreads == NULL_TREE) 3412 return 0; 3413 3414 bool in_loop_pipeline = scev_initialized_p (); 3415 if (!in_loop_pipeline) 3416 loop_optimizer_init (LOOPS_NORMAL 3417 | LOOPS_HAVE_RECORDED_EXITS); 3418 3419 if (number_of_loops (fun) <= 1) 3420 return 0; 3421 3422 if (!in_loop_pipeline) 3423 { 3424 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); 3425 scev_initialize (); 3426 } 3427 3428 unsigned int todo = 0; 3429 if (parallelize_loops (oacc_kernels_p)) 3430 { 3431 fun->curr_properties &= ~(PROP_gimple_eomp); 3432 3433 checking_verify_loop_structure (); 3434 3435 todo |= TODO_update_ssa; 3436 } 3437 3438 if (!in_loop_pipeline) 3439 { 3440 scev_finalize (); 3441 loop_optimizer_finalize (); 3442 } 3443 3444 return todo; 3445 } 3446 3447 } // anon namespace 3448 3449 gimple_opt_pass * 3450 make_pass_parallelize_loops (gcc::context *ctxt) 3451 { 3452 return new pass_parallelize_loops (ctxt); 3453 } 3454