1 /* Loop unroll-and-jam. 2 Copyright (C) 2017-2018 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify it 7 under the terms of the GNU General Public License as published by the 8 Free Software Foundation; either version 3, or (at your option) any 9 later version. 10 11 GCC is distributed in the hope that it will be useful, but WITHOUT 12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GCC; see the file COPYING3. If not see 18 <http://www.gnu.org/licenses/>. */ 19 20 #include "config.h" 21 #include "system.h" 22 #include "coretypes.h" 23 #include "params.h" 24 #include "tree-pass.h" 25 #include "backend.h" 26 #include "tree.h" 27 #include "gimple.h" 28 #include "ssa.h" 29 #include "fold-const.h" 30 #include "tree-cfg.h" 31 #include "tree-ssa.h" 32 #include "tree-ssa-loop-niter.h" 33 #include "tree-ssa-loop.h" 34 #include "tree-ssa-loop-manip.h" 35 #include "cfgloop.h" 36 #include "tree-scalar-evolution.h" 37 #include "gimple-iterator.h" 38 #include "cfghooks.h" 39 #include "tree-data-ref.h" 40 #include "tree-ssa-loop-ivopts.h" 41 #include "tree-vectorizer.h" 42 43 /* Unroll and Jam transformation 44 45 This is a combination of two transformations, where the second 46 is not always valid. It's applicable if a loop nest has redundancies 47 over the iterations of an outer loop while not having that with 48 an inner loop. 49 50 Given this nest: 51 for (i) { 52 for (j) { 53 B(i,j) 54 } 55 } 56 57 first unroll: 58 for (i by 2) { 59 for (j) { 60 B(i,j) 61 } 62 for (j) { 63 B(i+1,j) 64 } 65 } 66 67 then fuse the two adjacent inner loops resulting from that: 68 for (i by 2) { 69 for (j) { 70 B(i,j) 71 B(i+1,j) 72 } 73 } 74 75 As the order of evaluations of the body B changes this is valid 76 only in certain situations: all distance vectors need to be forward. 77 Additionally if there are multiple induction variables than just 78 a counting control IV (j above) we can also deal with some situations. 79 80 The validity is checked by unroll_jam_possible_p, and the data-dep 81 testing below. 82 83 A trivial example where the fusion is wrong would be when 84 B(i,j) == x[j-1] = x[j]; 85 for (i by 2) { 86 for (j) { 87 x[j-1] = x[j]; 88 } 89 for (j) { 90 x[j-1] = x[j]; 91 } 92 } effect: move content to front by two elements 93 --> 94 for (i by 2) { 95 for (j) { 96 x[j-1] = x[j]; 97 x[j-1] = x[j]; 98 } 99 } effect: move content to front by one element 100 */ 101 102 /* Modify the loop tree for the fact that all code once belonging 103 to the OLD loop or the outer loop of OLD now is inside LOOP. */ 104 105 static void 106 merge_loop_tree (struct loop *loop, struct loop *old) 107 { 108 basic_block *bbs; 109 int i, n; 110 struct loop *subloop; 111 edge e; 112 edge_iterator ei; 113 114 /* Find its nodes. */ 115 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 116 n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); 117 118 for (i = 0; i < n; i++) 119 { 120 /* If the block was direct child of OLD loop it's now part 121 of LOOP. If it was outside OLD, then it moved into LOOP 122 as well. This avoids changing the loop father for BBs 123 in inner loops of OLD. */ 124 if (bbs[i]->loop_father == old 125 || loop_depth (bbs[i]->loop_father) < loop_depth (old)) 126 { 127 remove_bb_from_loops (bbs[i]); 128 add_bb_to_loop (bbs[i], loop); 129 continue; 130 } 131 132 /* If we find a direct subloop of OLD, move it to LOOP. */ 133 subloop = bbs[i]->loop_father; 134 if (loop_outer (subloop) == old && subloop->header == bbs[i]) 135 { 136 flow_loop_tree_node_remove (subloop); 137 flow_loop_tree_node_add (loop, subloop); 138 } 139 } 140 141 /* Update the information about loop exit edges. */ 142 for (i = 0; i < n; i++) 143 { 144 FOR_EACH_EDGE (e, ei, bbs[i]->succs) 145 { 146 rescan_loop_exit (e, false, false); 147 } 148 } 149 150 loop->num_nodes = n; 151 152 free (bbs); 153 } 154 155 /* BB is part of the outer loop of an unroll-and-jam situation. 156 Check if any statements therein would prevent the transformation. */ 157 158 static bool 159 bb_prevents_fusion_p (basic_block bb) 160 { 161 gimple_stmt_iterator gsi; 162 /* BB is duplicated by outer unrolling and then all N-1 first copies 163 move into the body of the fused inner loop. If BB exits the outer loop 164 the last copy still doess so, and the first N-1 copies are cancelled 165 by loop unrolling, so also after fusion it's the exit block. 166 But there might be other reasons that prevent fusion: 167 * stores or unknown side-effects prevent fusion 168 * loads don't 169 * computations into SSA names: these aren't problematic. Their 170 result will be unused on the exit edges of the first N-1 copies 171 (those aren't taken after unrolling). If they are used on the 172 other edge (the one leading to the outer latch block) they are 173 loop-carried (on the outer loop) and the Nth copy of BB will 174 compute them again (i.e. the first N-1 copies will be dead). */ 175 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 176 { 177 gimple *g = gsi_stmt (gsi); 178 if (gimple_vdef (g) || gimple_has_side_effects (g)) 179 return true; 180 } 181 return false; 182 } 183 184 /* Given an inner loop LOOP (of some OUTER loop) determine if 185 we can safely fuse copies of it (generated by outer unrolling). 186 If so return true, otherwise return false. */ 187 188 static bool 189 unroll_jam_possible_p (struct loop *outer, struct loop *loop) 190 { 191 basic_block *bbs; 192 int i, n; 193 struct tree_niter_desc niter; 194 195 /* When fusing the loops we skip the latch block 196 of the first one, so it mustn't have any effects to 197 preserve. */ 198 if (!empty_block_p (loop->latch)) 199 return false; 200 201 if (!single_exit (loop)) 202 return false; 203 204 /* We need a perfect nest. Quick check for adjacent inner loops. */ 205 if (outer->inner != loop || loop->next) 206 return false; 207 208 /* Prevent head-controlled inner loops, that we usually have. 209 The guard block would need to be accepted 210 (invariant condition either entering or skipping the loop), 211 without also accepting arbitrary control flow. When unswitching 212 ran before us (as with -O3) this won't be a problem because its 213 outer loop unswitching will have moved out the invariant condition. 214 215 If we do that we need to extend fuse_loops() to cope with this 216 by threading through the (still invariant) copied condition 217 between the two loop copies. */ 218 if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) 219 return false; 220 221 /* The number of iterations of the inner loop must be loop invariant 222 with respect to the outer loop. */ 223 if (!number_of_iterations_exit (loop, single_exit (loop), &niter, 224 false, true) 225 || niter.cmp == ERROR_MARK 226 || !integer_zerop (niter.may_be_zero) 227 || !expr_invariant_in_loop_p (outer, niter.niter)) 228 return false; 229 230 /* And check blocks belonging to just outer loop. */ 231 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 232 n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); 233 234 for (i = 0; i < n; i++) 235 if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i])) 236 break; 237 free (bbs); 238 if (i != n) 239 return false; 240 241 /* For now we can safely fuse copies of LOOP only if all 242 loop carried variables are inductions (or the virtual op). 243 244 We could handle reductions as well (the initial value in the second 245 body would be the after-iter value of the first body) if it's over 246 an associative and commutative operation. We wouldn't 247 be able to handle unknown cycles. */ 248 gphi_iterator psi; 249 for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) 250 { 251 affine_iv iv; 252 tree op = gimple_phi_result (psi.phi ()); 253 254 if (virtual_operand_p (op)) 255 continue; 256 if (!simple_iv (loop, loop, op, &iv, true)) 257 return false; 258 /* The inductions must be regular, loop invariant step and initial 259 value. */ 260 if (!expr_invariant_in_loop_p (outer, iv.step) 261 || !expr_invariant_in_loop_p (outer, iv.base)) 262 return false; 263 /* XXX With more effort we could also be able to deal with inductions 264 where the initial value is loop variant but a simple IV in the 265 outer loop. The initial value for the second body would be 266 the original initial value plus iv.base.step. The next value 267 for the fused loop would be the original next value of the first 268 copy, _not_ the next value of the second body. */ 269 } 270 271 return true; 272 } 273 274 /* Fuse LOOP with all further neighbors. The loops are expected to 275 be in appropriate form. */ 276 277 static void 278 fuse_loops (struct loop *loop) 279 { 280 struct loop *next = loop->next; 281 282 while (next) 283 { 284 edge e; 285 286 remove_branch (single_pred_edge (loop->latch)); 287 /* Make delete_basic_block not fiddle with the loop structure. */ 288 basic_block oldlatch = loop->latch; 289 loop->latch = NULL; 290 delete_basic_block (oldlatch); 291 e = redirect_edge_and_branch (loop_latch_edge (next), 292 loop->header); 293 loop->latch = e->src; 294 flush_pending_stmts (e); 295 296 gcc_assert (EDGE_COUNT (next->header->preds) == 1); 297 298 /* The PHI nodes of the second body (single-argument now) 299 need adjustments to use the right values: either directly 300 the value of the corresponding PHI in the first copy or 301 the one leaving the first body which unrolling did for us. 302 303 See also unroll_jam_possible_p() for further possibilities. */ 304 gphi_iterator psi_first, psi_second; 305 e = single_pred_edge (next->header); 306 for (psi_first = gsi_start_phis (loop->header), 307 psi_second = gsi_start_phis (next->header); 308 !gsi_end_p (psi_first); 309 gsi_next (&psi_first), gsi_next (&psi_second)) 310 { 311 gphi *phi_first = psi_first.phi (); 312 gphi *phi_second = psi_second.phi (); 313 tree firstop = gimple_phi_result (phi_first); 314 /* The virtual operand is correct already as it's 315 always live at exit, hence has a LCSSA node and outer 316 loop unrolling updated SSA form. */ 317 if (virtual_operand_p (firstop)) 318 continue; 319 320 /* Due to unroll_jam_possible_p() we know that this is 321 an induction. The second body goes over the same 322 iteration space. */ 323 add_phi_arg (phi_second, firstop, e, 324 gimple_location (phi_first)); 325 } 326 gcc_assert (gsi_end_p (psi_second)); 327 328 merge_loop_tree (loop, next); 329 gcc_assert (!next->num_nodes); 330 struct loop *ln = next->next; 331 delete_loop (next); 332 next = ln; 333 } 334 rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); 335 } 336 337 /* Returns true if the distance in DDR can be determined and adjusts 338 the unroll factor in *UNROLL to make unrolling valid for that distance. 339 Otherwise return false. 340 341 If this data dep can lead to a removed memory reference, increment 342 *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor 343 for this to happen. */ 344 345 static bool 346 adjust_unroll_factor (struct data_dependence_relation *ddr, 347 unsigned *unroll, unsigned *profit_unroll, 348 unsigned *removed) 349 { 350 bool ret = false; 351 if (DDR_ARE_DEPENDENT (ddr) != chrec_known) 352 { 353 if (DDR_NUM_DIST_VECTS (ddr) == 0) 354 return false; 355 unsigned i; 356 lambda_vector dist_v; 357 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 358 { 359 /* A distance (a,b) is at worst transformed into (a/N,b) by the 360 unrolling (factor N), so the transformation is valid if 361 a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll 362 factor needs to be limited so that the first condition holds. 363 That may limit the factor down to zero in the worst case. */ 364 int dist = dist_v[0]; 365 if (dist < 0) 366 gcc_unreachable (); 367 else if ((unsigned)dist >= *unroll) 368 ; 369 else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) 370 || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) 371 && dist > 0)) 372 ; 373 else 374 *unroll = dist; 375 376 /* With a distance (a,0) it's always profitable to unroll-and-jam 377 (by a+1), because one memory reference will go away. With 378 (a,b) and b != 0 that's less clear. We will increase the 379 number of streams without lowering the number of mem refs. 380 So for now only handle the first situation. */ 381 if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 382 { 383 *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); 384 (*removed)++; 385 } 386 387 ret = true; 388 } 389 } 390 return ret; 391 } 392 393 /* Main entry point for the unroll-and-jam transformation 394 described above. */ 395 396 static unsigned int 397 tree_loop_unroll_and_jam (void) 398 { 399 struct loop *loop; 400 bool changed = false; 401 402 gcc_assert (scev_initialized_p ()); 403 404 /* Go through all innermost loops. */ 405 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) 406 { 407 struct loop *outer = loop_outer (loop); 408 409 if (loop_depth (loop) < 2 410 || optimize_loop_nest_for_size_p (outer)) 411 continue; 412 413 if (!unroll_jam_possible_p (outer, loop)) 414 continue; 415 416 vec<data_reference_p> datarefs; 417 vec<ddr_p> dependences; 418 unsigned unroll_factor, profit_unroll, removed; 419 struct tree_niter_desc desc; 420 bool unroll = false; 421 422 auto_vec<loop_p, 3> loop_nest; 423 dependences.create (10); 424 datarefs.create (10); 425 if (!compute_data_dependences_for_loop (outer, true, &loop_nest, 426 &datarefs, &dependences)) 427 { 428 if (dump_file && (dump_flags & TDF_DETAILS)) 429 fprintf (dump_file, "Cannot analyze data dependencies\n"); 430 free_data_refs (datarefs); 431 free_dependence_relations (dependences); 432 return false; 433 } 434 if (!datarefs.length ()) 435 continue; 436 437 if (dump_file && (dump_flags & TDF_DETAILS)) 438 dump_data_dependence_relations (dump_file, dependences); 439 440 unroll_factor = (unsigned)-1; 441 profit_unroll = 1; 442 removed = 0; 443 444 /* Check all dependencies. */ 445 unsigned i; 446 struct data_dependence_relation *ddr; 447 FOR_EACH_VEC_ELT (dependences, i, ddr) 448 { 449 struct data_reference *dra, *drb; 450 451 /* If the refs are independend there's nothing to do. */ 452 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 453 continue; 454 dra = DDR_A (ddr); 455 drb = DDR_B (ddr); 456 /* Nothing interesting for the self dependencies. */ 457 if (dra == drb) 458 continue; 459 460 /* Now check the distance vector, for determining a sensible 461 outer unroll factor, and for validity of merging the inner 462 loop copies. */ 463 if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll, 464 &removed)) 465 { 466 /* Couldn't get the distance vector. For two reads that's 467 harmless (we assume we should unroll). For at least 468 one write this means we can't check the dependence direction 469 and hence can't determine safety. */ 470 471 if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) 472 { 473 unroll_factor = 0; 474 break; 475 } 476 } 477 } 478 479 /* We regard a user-specified minimum percentage of zero as a request 480 to ignore all profitability concerns and apply the transformation 481 always. */ 482 if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 483 profit_unroll = 2; 484 else if (removed * 100 / datarefs.length () 485 < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 486 profit_unroll = 1; 487 if (unroll_factor > profit_unroll) 488 unroll_factor = profit_unroll; 489 if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL)) 490 unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL); 491 unroll = (unroll_factor > 1 492 && can_unroll_loop_p (outer, unroll_factor, &desc)); 493 494 if (unroll) 495 { 496 if (dump_enabled_p ()) 497 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, 498 find_loop_location (outer), 499 "applying unroll and jam with factor %d\n", 500 unroll_factor); 501 initialize_original_copy_tables (); 502 tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), 503 &desc); 504 free_original_copy_tables (); 505 fuse_loops (outer->inner); 506 changed = true; 507 } 508 509 loop_nest.release (); 510 free_dependence_relations (dependences); 511 free_data_refs (datarefs); 512 } 513 514 if (changed) 515 { 516 scev_reset (); 517 free_dominance_info (CDI_DOMINATORS); 518 return TODO_cleanup_cfg; 519 } 520 return 0; 521 } 522 523 /* Pass boilerplate */ 524 525 namespace { 526 527 const pass_data pass_data_loop_jam = 528 { 529 GIMPLE_PASS, /* type */ 530 "unrolljam", /* name */ 531 OPTGROUP_LOOP, /* optinfo_flags */ 532 TV_LOOP_JAM, /* tv_id */ 533 PROP_cfg, /* properties_required */ 534 0, /* properties_provided */ 535 0, /* properties_destroyed */ 536 0, /* todo_flags_start */ 537 0, /* todo_flags_finish */ 538 }; 539 540 class pass_loop_jam : public gimple_opt_pass 541 { 542 public: 543 pass_loop_jam (gcc::context *ctxt) 544 : gimple_opt_pass (pass_data_loop_jam, ctxt) 545 {} 546 547 /* opt_pass methods: */ 548 virtual bool gate (function *) { return flag_unroll_jam != 0; } 549 virtual unsigned int execute (function *); 550 551 }; 552 553 unsigned int 554 pass_loop_jam::execute (function *fun) 555 { 556 if (number_of_loops (fun) <= 1) 557 return 0; 558 559 return tree_loop_unroll_and_jam (); 560 } 561 562 } 563 564 gimple_opt_pass * 565 make_pass_loop_jam (gcc::context *ctxt) 566 { 567 return new pass_loop_jam (ctxt); 568 } 569