1 /* Loop unroll-and-jam. 2 Copyright (C) 2017-2019 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify it 7 under the terms of the GNU General Public License as published by the 8 Free Software Foundation; either version 3, or (at your option) any 9 later version. 10 11 GCC is distributed in the hope that it will be useful, but WITHOUT 12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GCC; see the file COPYING3. If not see 18 <http://www.gnu.org/licenses/>. */ 19 20 #include "config.h" 21 #include "system.h" 22 #include "coretypes.h" 23 #include "params.h" 24 #include "tree-pass.h" 25 #include "backend.h" 26 #include "tree.h" 27 #include "gimple.h" 28 #include "ssa.h" 29 #include "fold-const.h" 30 #include "tree-cfg.h" 31 #include "tree-ssa.h" 32 #include "tree-ssa-loop-niter.h" 33 #include "tree-ssa-loop.h" 34 #include "tree-ssa-loop-manip.h" 35 #include "cfgloop.h" 36 #include "tree-scalar-evolution.h" 37 #include "gimple-iterator.h" 38 #include "cfghooks.h" 39 #include "tree-data-ref.h" 40 #include "tree-ssa-loop-ivopts.h" 41 #include "tree-vectorizer.h" 42 43 /* Unroll and Jam transformation 44 45 This is a combination of two transformations, where the second 46 is not always valid. It's applicable if a loop nest has redundancies 47 over the iterations of an outer loop while not having that with 48 an inner loop. 49 50 Given this nest: 51 for (i) { 52 for (j) { 53 B(i,j) 54 } 55 } 56 57 first unroll: 58 for (i by 2) { 59 for (j) { 60 B(i,j) 61 } 62 for (j) { 63 B(i+1,j) 64 } 65 } 66 67 then fuse the two adjacent inner loops resulting from that: 68 for (i by 2) { 69 for (j) { 70 B(i,j) 71 B(i+1,j) 72 } 73 } 74 75 As the order of evaluations of the body B changes this is valid 76 only in certain situations: all distance vectors need to be forward. 77 Additionally if there are multiple induction variables than just 78 a counting control IV (j above) we can also deal with some situations. 79 80 The validity is checked by unroll_jam_possible_p, and the data-dep 81 testing below. 82 83 A trivial example where the fusion is wrong would be when 84 B(i,j) == x[j-1] = x[j]; 85 for (i by 2) { 86 for (j) { 87 x[j-1] = x[j]; 88 } 89 for (j) { 90 x[j-1] = x[j]; 91 } 92 } effect: move content to front by two elements 93 --> 94 for (i by 2) { 95 for (j) { 96 x[j-1] = x[j]; 97 x[j-1] = x[j]; 98 } 99 } effect: move content to front by one element 100 */ 101 102 /* Modify the loop tree for the fact that all code once belonging 103 to the OLD loop or the outer loop of OLD now is inside LOOP. */ 104 105 static void 106 merge_loop_tree (struct loop *loop, struct loop *old) 107 { 108 basic_block *bbs; 109 int i, n; 110 struct loop *subloop; 111 edge e; 112 edge_iterator ei; 113 114 /* Find its nodes. */ 115 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 116 n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); 117 118 for (i = 0; i < n; i++) 119 { 120 /* If the block was direct child of OLD loop it's now part 121 of LOOP. If it was outside OLD, then it moved into LOOP 122 as well. This avoids changing the loop father for BBs 123 in inner loops of OLD. */ 124 if (bbs[i]->loop_father == old 125 || loop_depth (bbs[i]->loop_father) < loop_depth (old)) 126 { 127 remove_bb_from_loops (bbs[i]); 128 add_bb_to_loop (bbs[i], loop); 129 continue; 130 } 131 132 /* If we find a direct subloop of OLD, move it to LOOP. */ 133 subloop = bbs[i]->loop_father; 134 if (loop_outer (subloop) == old && subloop->header == bbs[i]) 135 { 136 flow_loop_tree_node_remove (subloop); 137 flow_loop_tree_node_add (loop, subloop); 138 } 139 } 140 141 /* Update the information about loop exit edges. */ 142 for (i = 0; i < n; i++) 143 { 144 FOR_EACH_EDGE (e, ei, bbs[i]->succs) 145 { 146 rescan_loop_exit (e, false, false); 147 } 148 } 149 150 loop->num_nodes = n; 151 152 free (bbs); 153 } 154 155 /* BB is part of the outer loop of an unroll-and-jam situation. 156 Check if any statements therein would prevent the transformation. */ 157 158 static bool 159 bb_prevents_fusion_p (basic_block bb) 160 { 161 gimple_stmt_iterator gsi; 162 /* BB is duplicated by outer unrolling and then all N-1 first copies 163 move into the body of the fused inner loop. If BB exits the outer loop 164 the last copy still does so, and the first N-1 copies are cancelled 165 by loop unrolling, so also after fusion it's the exit block. 166 But there might be other reasons that prevent fusion: 167 * stores or unknown side-effects prevent fusion 168 * loads don't 169 * computations into SSA names: these aren't problematic. Their 170 result will be unused on the exit edges of the first N-1 copies 171 (those aren't taken after unrolling). If they are used on the 172 other edge (the one leading to the outer latch block) they are 173 loop-carried (on the outer loop) and the Nth copy of BB will 174 compute them again (i.e. the first N-1 copies will be dead). */ 175 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 176 { 177 gimple *g = gsi_stmt (gsi); 178 if (gimple_vdef (g) || gimple_has_side_effects (g)) 179 return true; 180 } 181 return false; 182 } 183 184 /* Given an inner loop LOOP (of some OUTER loop) determine if 185 we can safely fuse copies of it (generated by outer unrolling). 186 If so return true, otherwise return false. */ 187 188 static bool 189 unroll_jam_possible_p (struct loop *outer, struct loop *loop) 190 { 191 basic_block *bbs; 192 int i, n; 193 struct tree_niter_desc niter; 194 195 /* When fusing the loops we skip the latch block 196 of the first one, so it mustn't have any effects to 197 preserve. */ 198 if (!empty_block_p (loop->latch)) 199 return false; 200 201 if (!single_exit (loop)) 202 return false; 203 204 /* We need a perfect nest. Quick check for adjacent inner loops. */ 205 if (outer->inner != loop || loop->next) 206 return false; 207 208 /* Prevent head-controlled inner loops, that we usually have. 209 The guard block would need to be accepted 210 (invariant condition either entering or skipping the loop), 211 without also accepting arbitrary control flow. When unswitching 212 ran before us (as with -O3) this won't be a problem because its 213 outer loop unswitching will have moved out the invariant condition. 214 215 If we do that we need to extend fuse_loops() to cope with this 216 by threading through the (still invariant) copied condition 217 between the two loop copies. */ 218 if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) 219 return false; 220 221 /* The number of iterations of the inner loop must be loop invariant 222 with respect to the outer loop. */ 223 if (!number_of_iterations_exit (loop, single_exit (loop), &niter, 224 false, true) 225 || niter.cmp == ERROR_MARK 226 || !integer_zerop (niter.may_be_zero) 227 || !expr_invariant_in_loop_p (outer, niter.niter)) 228 return false; 229 230 /* If the inner loop produces any values that are used inside the 231 outer loop (except the virtual op) then it can flow 232 back (perhaps indirectly) into the inner loop. This prevents 233 fusion: without fusion the value at the last iteration is used, 234 with fusion the value after the initial iteration is used. 235 236 If all uses are outside the outer loop this doesn't prevent fusion; 237 the value of the last iteration is still used (and the values from 238 all intermediate iterations are dead). */ 239 gphi_iterator psi; 240 for (psi = gsi_start_phis (single_exit (loop)->dest); 241 !gsi_end_p (psi); gsi_next (&psi)) 242 { 243 imm_use_iterator imm_iter; 244 use_operand_p use_p; 245 tree op = gimple_phi_result (psi.phi ()); 246 if (virtual_operand_p (op)) 247 continue; 248 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op) 249 { 250 gimple *use_stmt = USE_STMT (use_p); 251 if (!is_gimple_debug (use_stmt) 252 && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt))) 253 return false; 254 } 255 } 256 257 /* And check blocks belonging to just outer loop. */ 258 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 259 n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); 260 261 for (i = 0; i < n; i++) 262 if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i])) 263 break; 264 free (bbs); 265 if (i != n) 266 return false; 267 268 /* For now we can safely fuse copies of LOOP only if all 269 loop carried variables are inductions (or the virtual op). 270 271 We could handle reductions as well (the initial value in the second 272 body would be the after-iter value of the first body) if it's over 273 an associative and commutative operation. We wouldn't 274 be able to handle unknown cycles. */ 275 for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) 276 { 277 affine_iv iv; 278 tree op = gimple_phi_result (psi.phi ()); 279 280 if (virtual_operand_p (op)) 281 continue; 282 if (!simple_iv (loop, loop, op, &iv, true)) 283 return false; 284 /* The inductions must be regular, loop invariant step and initial 285 value. */ 286 if (!expr_invariant_in_loop_p (outer, iv.step) 287 || !expr_invariant_in_loop_p (outer, iv.base)) 288 return false; 289 /* XXX With more effort we could also be able to deal with inductions 290 where the initial value is loop variant but a simple IV in the 291 outer loop. The initial value for the second body would be 292 the original initial value plus iv.base.step. The next value 293 for the fused loop would be the original next value of the first 294 copy, _not_ the next value of the second body. */ 295 } 296 297 return true; 298 } 299 300 /* Fuse LOOP with all further neighbors. The loops are expected to 301 be in appropriate form. */ 302 303 static void 304 fuse_loops (struct loop *loop) 305 { 306 struct loop *next = loop->next; 307 308 while (next) 309 { 310 edge e; 311 312 remove_branch (single_pred_edge (loop->latch)); 313 /* Make delete_basic_block not fiddle with the loop structure. */ 314 basic_block oldlatch = loop->latch; 315 loop->latch = NULL; 316 delete_basic_block (oldlatch); 317 e = redirect_edge_and_branch (loop_latch_edge (next), 318 loop->header); 319 loop->latch = e->src; 320 flush_pending_stmts (e); 321 322 gcc_assert (EDGE_COUNT (next->header->preds) == 1); 323 324 /* The PHI nodes of the second body (single-argument now) 325 need adjustments to use the right values: either directly 326 the value of the corresponding PHI in the first copy or 327 the one leaving the first body which unrolling did for us. 328 329 See also unroll_jam_possible_p() for further possibilities. */ 330 gphi_iterator psi_first, psi_second; 331 e = single_pred_edge (next->header); 332 for (psi_first = gsi_start_phis (loop->header), 333 psi_second = gsi_start_phis (next->header); 334 !gsi_end_p (psi_first); 335 gsi_next (&psi_first), gsi_next (&psi_second)) 336 { 337 gphi *phi_first = psi_first.phi (); 338 gphi *phi_second = psi_second.phi (); 339 tree firstop = gimple_phi_result (phi_first); 340 /* The virtual operand is correct already as it's 341 always live at exit, hence has a LCSSA node and outer 342 loop unrolling updated SSA form. */ 343 if (virtual_operand_p (firstop)) 344 continue; 345 346 /* Due to unroll_jam_possible_p() we know that this is 347 an induction. The second body goes over the same 348 iteration space. */ 349 add_phi_arg (phi_second, firstop, e, 350 gimple_location (phi_first)); 351 } 352 gcc_assert (gsi_end_p (psi_second)); 353 354 merge_loop_tree (loop, next); 355 gcc_assert (!next->num_nodes); 356 struct loop *ln = next->next; 357 delete_loop (next); 358 next = ln; 359 } 360 rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); 361 } 362 363 /* Return true if any of the access functions for dataref A 364 isn't invariant with respect to loop LOOP_NEST. */ 365 static bool 366 any_access_function_variant_p (const struct data_reference *a, 367 const class loop *loop_nest) 368 { 369 unsigned int i; 370 vec<tree> fns = DR_ACCESS_FNS (a); 371 tree t; 372 373 FOR_EACH_VEC_ELT (fns, i, t) 374 if (!evolution_function_is_invariant_p (t, loop_nest->num)) 375 return true; 376 377 return false; 378 } 379 380 /* Returns true if the distance in DDR can be determined and adjusts 381 the unroll factor in *UNROLL to make unrolling valid for that distance. 382 Otherwise return false. DDR is with respect to the outer loop of INNER. 383 384 If this data dep can lead to a removed memory reference, increment 385 *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor 386 for this to happen. */ 387 388 static bool 389 adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr, 390 unsigned *unroll, unsigned *profit_unroll, 391 unsigned *removed) 392 { 393 bool ret = false; 394 if (DDR_ARE_DEPENDENT (ddr) != chrec_known) 395 { 396 if (DDR_NUM_DIST_VECTS (ddr) == 0) 397 return false; 398 unsigned i; 399 lambda_vector dist_v; 400 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 401 { 402 /* A distance (a,b) is at worst transformed into (a/N,b) by the 403 unrolling (factor N), so the transformation is valid if 404 a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll 405 factor needs to be limited so that the first condition holds. 406 That may limit the factor down to zero in the worst case. */ 407 int dist = dist_v[0]; 408 if (dist < 0) 409 gcc_unreachable (); 410 else if ((unsigned)dist >= *unroll) 411 ; 412 else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 413 { 414 /* We have (a,0) with a < N, so this will be transformed into 415 (0,0) after unrolling by N. This might potentially be a 416 problem, if it's not a read-read dependency. */ 417 if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))) 418 ; 419 else 420 { 421 /* So, at least one is a write, and we might reduce the 422 distance vector to (0,0). This is still no problem 423 if both data-refs are affine with respect to the inner 424 loops. But if one of them is invariant with respect 425 to an inner loop our reordering implicit in loop fusion 426 corrupts the program, as our data dependences don't 427 capture this. E.g. for: 428 for (0 <= i < n) 429 for (0 <= j < m) 430 a[i][0] = a[i+1][0] + 2; // (1) 431 b[i][j] = b[i+1][j] + 2; // (2) 432 the distance vector for both statements is (-1,0), 433 but exchanging the order for (2) is okay, while 434 for (1) it is not. To see this, write out the original 435 accesses (assume m is 2): 436 a i j original 437 0 0 0 r a[1][0] b[1][0] 438 1 0 0 w a[0][0] b[0][0] 439 2 0 1 r a[1][0] b[1][1] 440 3 0 1 w a[0][0] b[0][1] 441 4 1 0 r a[2][0] b[2][0] 442 5 1 0 w a[1][0] b[1][0] 443 after unroll-by-2 and fusion the accesses are done in 444 this order (from column a): 0,1, 4,5, 2,3, i.e. this: 445 a i j transformed 446 0 0 0 r a[1][0] b[1][0] 447 1 0 0 w a[0][0] b[0][0] 448 4 1 0 r a[2][0] b[2][0] 449 5 1 0 w a[1][0] b[1][0] 450 2 0 1 r a[1][0] b[1][1] 451 3 0 1 w a[0][0] b[0][1] 452 Note how access 2 accesses the same element as access 5 453 for array 'a' but not for array 'b'. */ 454 if (any_access_function_variant_p (DDR_A (ddr), inner) 455 && any_access_function_variant_p (DDR_B (ddr), inner)) 456 ; 457 else 458 /* And if any dataref of this pair is invariant with 459 respect to the inner loop, we have no chance than 460 to reduce the unroll factor. */ 461 *unroll = dist; 462 } 463 } 464 else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 465 ; 466 else 467 *unroll = dist; 468 469 /* With a distance (a,0) it's always profitable to unroll-and-jam 470 (by a+1), because one memory reference will go away. With 471 (a,b) and b != 0 that's less clear. We will increase the 472 number of streams without lowering the number of mem refs. 473 So for now only handle the first situation. */ 474 if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 475 { 476 *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); 477 (*removed)++; 478 } 479 480 ret = true; 481 } 482 } 483 return ret; 484 } 485 486 /* Main entry point for the unroll-and-jam transformation 487 described above. */ 488 489 static unsigned int 490 tree_loop_unroll_and_jam (void) 491 { 492 struct loop *loop; 493 bool changed = false; 494 495 gcc_assert (scev_initialized_p ()); 496 497 /* Go through all innermost loops. */ 498 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) 499 { 500 struct loop *outer = loop_outer (loop); 501 502 if (loop_depth (loop) < 2 503 || optimize_loop_nest_for_size_p (outer)) 504 continue; 505 506 if (!unroll_jam_possible_p (outer, loop)) 507 continue; 508 509 vec<data_reference_p> datarefs; 510 vec<ddr_p> dependences; 511 unsigned unroll_factor, profit_unroll, removed; 512 struct tree_niter_desc desc; 513 bool unroll = false; 514 515 auto_vec<loop_p, 3> loop_nest; 516 dependences.create (10); 517 datarefs.create (10); 518 if (!compute_data_dependences_for_loop (outer, true, &loop_nest, 519 &datarefs, &dependences)) 520 { 521 if (dump_file && (dump_flags & TDF_DETAILS)) 522 fprintf (dump_file, "Cannot analyze data dependencies\n"); 523 free_data_refs (datarefs); 524 free_dependence_relations (dependences); 525 continue; 526 } 527 if (!datarefs.length ()) 528 continue; 529 530 if (dump_file && (dump_flags & TDF_DETAILS)) 531 dump_data_dependence_relations (dump_file, dependences); 532 533 unroll_factor = (unsigned)-1; 534 profit_unroll = 1; 535 removed = 0; 536 537 /* Check all dependencies. */ 538 unsigned i; 539 struct data_dependence_relation *ddr; 540 FOR_EACH_VEC_ELT (dependences, i, ddr) 541 { 542 struct data_reference *dra, *drb; 543 544 /* If the refs are independend there's nothing to do. */ 545 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 546 continue; 547 dra = DDR_A (ddr); 548 drb = DDR_B (ddr); 549 /* Nothing interesting for the self dependencies. */ 550 if (dra == drb) 551 continue; 552 553 /* Now check the distance vector, for determining a sensible 554 outer unroll factor, and for validity of merging the inner 555 loop copies. */ 556 if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll, 557 &removed)) 558 { 559 /* Couldn't get the distance vector. For two reads that's 560 harmless (we assume we should unroll). For at least 561 one write this means we can't check the dependence direction 562 and hence can't determine safety. */ 563 564 if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) 565 { 566 unroll_factor = 0; 567 break; 568 } 569 } 570 } 571 572 /* We regard a user-specified minimum percentage of zero as a request 573 to ignore all profitability concerns and apply the transformation 574 always. */ 575 if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 576 profit_unroll = MAX(2, profit_unroll); 577 else if (removed * 100 / datarefs.length () 578 < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 579 profit_unroll = 1; 580 if (unroll_factor > profit_unroll) 581 unroll_factor = profit_unroll; 582 if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL)) 583 unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL); 584 unroll = (unroll_factor > 1 585 && can_unroll_loop_p (outer, unroll_factor, &desc)); 586 587 if (unroll) 588 { 589 if (dump_enabled_p ()) 590 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, 591 find_loop_location (outer), 592 "applying unroll and jam with factor %d\n", 593 unroll_factor); 594 initialize_original_copy_tables (); 595 tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), 596 &desc); 597 free_original_copy_tables (); 598 fuse_loops (outer->inner); 599 changed = true; 600 } 601 602 loop_nest.release (); 603 free_dependence_relations (dependences); 604 free_data_refs (datarefs); 605 } 606 607 if (changed) 608 { 609 scev_reset (); 610 free_dominance_info (CDI_DOMINATORS); 611 return TODO_cleanup_cfg; 612 } 613 return 0; 614 } 615 616 /* Pass boilerplate */ 617 618 namespace { 619 620 const pass_data pass_data_loop_jam = 621 { 622 GIMPLE_PASS, /* type */ 623 "unrolljam", /* name */ 624 OPTGROUP_LOOP, /* optinfo_flags */ 625 TV_LOOP_JAM, /* tv_id */ 626 PROP_cfg, /* properties_required */ 627 0, /* properties_provided */ 628 0, /* properties_destroyed */ 629 0, /* todo_flags_start */ 630 0, /* todo_flags_finish */ 631 }; 632 633 class pass_loop_jam : public gimple_opt_pass 634 { 635 public: 636 pass_loop_jam (gcc::context *ctxt) 637 : gimple_opt_pass (pass_data_loop_jam, ctxt) 638 {} 639 640 /* opt_pass methods: */ 641 virtual bool gate (function *) { return flag_unroll_jam != 0; } 642 virtual unsigned int execute (function *); 643 644 }; 645 646 unsigned int 647 pass_loop_jam::execute (function *fun) 648 { 649 if (number_of_loops (fun) <= 1) 650 return 0; 651 652 return tree_loop_unroll_and_jam (); 653 } 654 655 } 656 657 gimple_opt_pass * 658 make_pass_loop_jam (gcc::context *ctxt) 659 { 660 return new pass_loop_jam (ctxt); 661 } 662