/* Array prefetching.
   Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"
#include "optabs.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do so, we evaluate the reuse among the accesses and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

      (0) obviously has PREFETCH_BEFORE 1
      (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
          location 64 iterations before it, and PREFETCH_MOD 64 (since
          it hits the same cache line otherwise).
      (2) has PREFETCH_MOD 64
      (3) has PREFETCH_MOD 4
      (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
          the cache line accessed by (4) is the same with probability only
          7/32.
      (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance till the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   The function is_loop_prefetching_profitable() implements a cost model
   to determine if prefetching is profitable for a given loop.  The cost
   model has two heuristics:
   1. A heuristic that determines whether the given loop has enough CPU
      ops that can be overlapped with cache missing memory ops.  If not,
      the loop won't benefit from prefetching.  This is implemented by
      requiring the ratio between the instruction count and the mem ref
      count to be above a certain minimum.
   2. A heuristic that disables prefetching in a loop with an unknown trip
      count if the prefetching cost is above a certain limit.  The relative
      prefetching cost is estimated by taking the ratio between the
      prefetch count and the total instruction count (this models the
      I-cache cost).
   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
         in other cache aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */
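
/* For illustration only -- a schematic sketch of the transformation, not
   the literal output of the pass on any particular target.  Assuming a
   64-byte cache line, 4-byte array elements and an unroll factor of 16
   (chosen so that each unrolled body touches one new cache line per
   array), a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i];

   is transformed roughly into

     for (i = 0; i < n; i += 16)
       {
         __builtin_prefetch (&a[i + PREFETCH_DISTANCE], 1, 3);
         __builtin_prefetch (&b[i + PREFETCH_DISTANCE], 0, 3);
         a[i] = b[i];
         ...
         a[i + 15] = b[i + 15];
       }

   where PREFETCH_DISTANCE stands for the ahead distance computed in step
   3) above, the second argument of __builtin_prefetch distinguishes writes
   from reads, and the third argument is the temporal locality hint.  */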

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16

/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* A group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;                    /* Base of the reference.  */
  HOST_WIDE_INT step;           /* Step of the reference.  */
  struct mem_ref *refs;         /* References in the group.  */
  struct mem_ref_group *next;   /* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  gimple stmt;                  /* Statement in which the reference appears.  */
  tree mem;                     /* The reference.  */
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
                                /* Prefetch only each PREFETCH_MOD-th
                                   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
                                /* Prefetch only first PREFETCH_BEFORE
                                   iterations.  */
  unsigned reuse_distance;      /* The amount of data accessed before the first
                                   reuse of this value.  */
  struct mem_ref *next;         /* The next reference in the group.  */
  unsigned write_p : 1;         /* Is it a write?  */
  unsigned independent_p : 1;   /* True if the reference is independent of
                                   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch?  */
  unsigned storent_p : 1;       /* True if we changed the store to a
                                   nontemporal one.  */
};
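
/* To illustrate the two structures above on the example loop from the
   comment at the top of the file (informally -- the actual trees are the
   GIMPLE forms of these expressions): the references a[i] and a[i + 64]
   both fall into one mem_ref_group with base `a' and step 1, as two
   mem_refs with delta 0 and delta 64, while a[16*i] forms a separate
   group with step 16.  The reuse pruning below only compares references
   that belong to the same group.  */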

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
                      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
          && operand_equal_p ((*groups)->base, base, 0))
        return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
        break;
    }

  group = XNEW (struct mem_ref_group);
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
            HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It does not have to be possible for a write reference to reuse a
         read prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
          && write_p
          && !(*aref)->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !write_p
          && (*aref)->write_p)
        continue;

      if ((*aref)->delta == delta)
        return;
    }

  (*aref) = XNEW (struct mem_ref);
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->reuse_distance = 0;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;
  (*aref)->independent_p = false;
  (*aref)->storent_p = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
        {
          next_r = ref->next;
          free (ref);
        }
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;            /* Loop of the reference.  */
  gimple stmt;                  /* Statement of the reference.  */
  HOST_WIDE_INT *step;          /* Step of the memory reference.  */
  HOST_WIDE_INT *delta;         /* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = (struct ar_data *) data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
                  *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (!cst_and_fits_in_hwi (step))
    return false;
  istep = int_cst_value (step);

  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
        return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
             HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
             gimple stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}
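
/* As an informal example of the decomposition above (assumed types, for
   illustration only): for a reference a[i + 3] in a loop with induction
   variable i and 4-byte array elements, the analysis yields BASE == a[0]
   (the reference with the index replaced by zero), STEP == 4 and
   DELTA == 12, i.e. the address of the reference is &a[0] + 4 * iter + 12.  */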

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
   reference was recorded, false otherwise.  */

static bool
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
                              tree ref, bool write_p, gimple stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (get_base_address (ref) == NULL)
    return false;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return false;

  /* Stop if the address of BASE could not be taken.  */
  if (may_be_nonaddressable_p (base))
    return false;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);

  return true;
}

/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
   true if there are no other memory references inside the loop.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  gimple_stmt_iterator bsi;
  gimple stmt;
  tree lhs, rhs;
  struct mem_ref_group *refs = NULL;

  *no_other_refs = true;
  *ref_count = 0;

  /* Scan the loop body in order, so that the former references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
        continue;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
        {
          stmt = gsi_stmt (bsi);

          if (gimple_code (stmt) != GIMPLE_ASSIGN)
            {
              if (gimple_vuse (stmt)
                  || (is_gimple_call (stmt)
                      && !(gimple_call_flags (stmt) & ECF_CONST)))
                *no_other_refs = false;
              continue;
            }

          lhs = gimple_assign_lhs (stmt);
          rhs = gimple_assign_rhs1 (stmt);

          if (REFERENCE_CLASS_P (rhs))
            {
              *no_other_refs &= gather_memory_references_ref (loop, &refs,
                                                              rhs, false, stmt);
              *ref_count += 1;
            }
          if (REFERENCE_CLASS_P (lhs))
            {
              *no_other_refs &= gather_memory_references_ref (loop, &refs,
                                                              lhs, true, stmt);
              *ref_count += 1;
            }
        }
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}

/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Given a CACHE_LINE_SIZE and two inductive memory references
   with a common STEP greater than CACHE_LINE_SIZE and an address
   difference DELTA, compute the probability that they will fall
   in different cache lines.  DISTINCT_ITERS is the number of
   distinct iterations after which the pattern repeats itself.
   ALIGN_UNIT is the unit of alignment in bytes.  */

static int
compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
                   HOST_WIDE_INT step, HOST_WIDE_INT delta,
                   unsigned HOST_WIDE_INT distinct_iters,
                   int align_unit)
{
  unsigned align, iter;
  int total_positions, miss_positions, miss_rate;
  int address1, address2, cache_line1, cache_line2;

  total_positions = 0;
  miss_positions = 0;

  /* Iterate through all possible alignments of the first
     memory reference within its cache line.  */
  for (align = 0; align < cache_line_size; align += align_unit)

    /* Iterate through all distinct iterations.  */
    for (iter = 0; iter < distinct_iters; iter++)
      {
        address1 = align + step * iter;
        address2 = address1 + delta;
        cache_line1 = address1 / cache_line_size;
        cache_line2 = address2 / cache_line_size;
        total_positions += 1;
        if (cache_line1 != cache_line2)
          miss_positions += 1;
      }
  miss_rate = 1000 * miss_positions / total_positions;
  return miss_rate;
}
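
/* For example (a worked instance with assumed numbers): with
   cache_line_size == 64, step == 96, delta == 32, distinct_iters == 2 and
   align_unit == 4, the loops above try the 16 alignments 0, 4, ..., 60 in
   each of the 2 distinct iterations; the two addresses fall into different
   cache lines in 16 of those 32 positions, giving a miss rate of 500 per
   mille, well above the default ACCEPTABLE_MISS_RATE of 50, so the caller
   would keep prefetching both references.  */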

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
                          bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
  int miss_rate;
  HOST_WIDE_INT reduced_step;
  unsigned HOST_WIDE_INT reduced_prefetch_block;
  tree ref_type;
  int align_unit;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
         former.  */
      if (by_is_before)
        ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
         same cache line, prefetch just the first one.  */
      if (!by_is_before)
        return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
          != ddown (by->delta, PREFETCH_BLOCK))
        return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
        return;

      /* Transform the data so that we may assume that the accesses
         are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
        return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which that occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case with step > prefetch_block.  First reduce
     the ratio between the step and the cache line size to its simplest
     terms.  The resulting denominator will then represent the number of
     distinct iterations after which each address will go back to its
     initial location within the cache line.  This computation assumes
     that PREFETCH_BLOCK is a power of two.  */
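  /* For instance (assumed numbers, for illustration only): with
     PREFETCH_BLOCK == 64 and step == 96, the loop below divides both by
     their common power-of-two factor 32, giving reduced_prefetch_block == 2
     and reduced_step == 3; each address therefore returns to its original
     offset within the cache line after 2 distinct iterations, and the miss
     rate is evaluated over those 2 iterations.  */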
  prefetch_block = PREFETCH_BLOCK;
  reduced_prefetch_block = prefetch_block;
  reduced_step = step;
  while ((reduced_step & 1) == 0
         && reduced_prefetch_block > 1)
    {
      reduced_step >>= 1;
      reduced_prefetch_block >>= 1;
    }

  prefetch_before = delta / step;
  delta %= step;
  ref_type = TREE_TYPE (ref->mem);
  align_unit = TYPE_ALIGN (ref_type) / 8;
  miss_rate = compute_miss_rate (prefetch_block, step, delta,
                                 reduced_prefetch_block, align_unit);
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  miss_rate = compute_miss_rate (prefetch_block, step, delta,
                                 reduced_prefetch_block, align_unit);
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}

/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
        {
          before = false;
          continue;
        }

      if (!WRITE_CAN_USE_READ_PREFETCH
          && ref->write_p
          && !prune_by->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !ref->write_p
          && prune_by->write_p)
        continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

          if (ref_pruned->prefetch_before == PREFETCH_ALL
              && ref_pruned->prefetch_mod == 1)
            fprintf (dump_file, " no restrictions");
          else if (ref_pruned->prefetch_before == 0)
            fprintf (dump_file, " do not prefetch");
          else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
            fprintf (dump_file, " prefetch once");
          else
            {
              if (ref_pruned->prefetch_before != PREFETCH_ALL)
                {
                  fprintf (dump_file, " prefetch before ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_before);
                }
              if (ref_pruned->prefetch_mod != 1)
                {
                  fprintf (dump_file, " prefetch mod ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_mod);
                }
            }
          fprintf (dump_file, "\n");
        }
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for only the first few iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  /* Do not prefetch nontemporal stores.  */
  if (ref->storent_p)
    return false;

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
                     unsigned ahead)
{
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
  unsigned slots_per_prefetch;
  struct mem_ref *ref;
  bool any = false;

  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
     it will need a prefetch slot.  */
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
             slots_per_prefetch);

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
        if (!should_issue_prefetch_p (ref))
          continue;

        /* If we need to prefetch the reference each PREFETCH_MOD iterations,
           and we unroll the loop UNROLL_FACTOR times, we need to insert
           ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
           iteration.  */
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                        / ref->prefetch_mod);
        prefetch_slots = n_prefetches * slots_per_prefetch;

        /* If more than half of the prefetches would be lost anyway, do not
           issue the prefetch.  */
        if (2 * remaining_prefetch_slots < prefetch_slots)
          continue;

        ref->issue_prefetch_p = true;

        if (remaining_prefetch_slots <= prefetch_slots)
          return true;
        remaining_prefetch_slots -= prefetch_slots;
        any = true;
      }

  return any;
}
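
/* A worked example of the bookkeeping above (assumed numbers, for
   illustration only): with SIMULTANEOUS_PREFETCHES == 6, AHEAD == 8 and
   UNROLL_FACTOR == 4, each prefetch instruction occupies
   (8 + 4 / 2) / 4 == 2 slots.  A reference with PREFETCH_MOD == 1 needs
   ceil (4 / 1) == 4 prefetch instructions per unrolled iteration, i.e.
   8 slots; since at least half of them (6 of 8) still fit into the
   remaining budget, the prefetch is issued and the budget is exhausted.  */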

/* Estimate the number of prefetches in the given GROUPS.  */

static int
estimate_prefetch_count (struct mem_ref_group *groups)
{
  struct mem_ref *ref;
  int prefetch_count = 0;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        prefetch_count++;

  return prefetch_count;
}

/* Issue prefetches for the reference REF into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, write_p, local;
  gimple prefetch;
  gimple_stmt_iterator bsi;
  unsigned n_prefetches, ap;
  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued%s prefetch for %p.\n",
             nontemporal ? " nontemporal" : "",
             (void *) ref);

  bsi = gsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
                                        true, NULL, true, GSI_SAME_STMT);
  write_p = ref->write_p ? integer_one_node : integer_zero_node;
  local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
                          addr_base, size_int (delta));
      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
                                       true, GSI_SAME_STMT);

      /* Create the prefetch instruction.  */
      prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
                                    3, addr, write_p, local);
      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
    }
}
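
/* Schematically (an illustration, not the exact GIMPLE produced): for a
   read reference with prefetch_mod == 4 in a group with step 4, unrolled
   by a factor of 8 and prefetched 16 iterations ahead, the code above
   emits n_prefetches == 2 calls in front of the statement:

     __builtin_prefetch (addr_base + (16 + 0 * 4) * 4, 0, 3);
     __builtin_prefetch (addr_base + (16 + 1 * 4) * 4, 0, 3);

   i.e. the addresses 64 and 80 bytes beyond the current access, with
   write_p == 0 and locality hint 3 (the hint would be 0 for a reference
   classified as nontemporal).  */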

/* Issue prefetches for the references in GROUPS into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
                  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
        issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Returns true if REF is a memory write for which a nontemporal store insn
   can be used.  */

static bool
nontemporal_store_p (struct mem_ref *ref)
{
  enum machine_mode mode;
  enum insn_code code;

  /* REF must be a write that is not reused.  We require it to be independent
     of all other memory references in the loop, as the nontemporal stores may
     be reordered with respect to other memory references.  */
  if (!ref->write_p
      || !ref->independent_p
      || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
    return false;

  /* Check that we have the storent instruction for the mode.  */
  mode = TYPE_MODE (TREE_TYPE (ref->mem));
  if (mode == BLKmode)
    return false;

  code = optab_handler (storent_optab, mode)->insn_code;
  return code != CODE_FOR_nothing;
}

/* If REF is a nontemporal store, we mark the corresponding modify statement
   and return true.  Otherwise, we return false.  */

static bool
mark_nontemporal_store (struct mem_ref *ref)
{
  if (!nontemporal_store_p (ref))
    return false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
             (void *) ref);

  gimple_assign_set_nontemporal_move (ref->stmt, true);
  ref->storent_p = true;

  return true;
}

/* Issue a memory fence instruction after LOOP.  */

static void
emit_mfence_after_loop (struct loop *loop)
{
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);
  edge exit;
  gimple call;
  gimple_stmt_iterator bsi;
  unsigned i;

  for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
    {
      call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);

      if (!single_pred_p (exit->dest)
          /* If possible, we prefer not to insert the fence on other paths
             in cfg.  */
          && !(exit->flags & EDGE_ABNORMAL))
        split_loop_exit_edge (exit);
      bsi = gsi_after_labels (exit->dest);

      gsi_insert_before (&bsi, call, GSI_NEW_STMT);
      mark_virtual_ops_for_renaming (call);
    }

  VEC_free (edge, heap, exits);
  update_ssa (TODO_update_ssa_only_virtuals);
}

/* Returns true if we can use storent in loop, false otherwise.  */

static bool
may_use_storent_in_loop_p (struct loop *loop)
{
  bool ret = true;

  if (loop->inner != NULL)
    return false;

  /* If we must issue a mfence insn after using storent, check that there
     is a suitable place for it at each of the loop exits.  */
  if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
    {
      VEC (edge, heap) *exits = get_loop_exit_edges (loop);
      unsigned i;
      edge exit;

      for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
        if ((exit->flags & EDGE_ABNORMAL)
            && exit->dest == EXIT_BLOCK_PTR)
          ret = false;

      VEC_free (edge, heap, exits);
    }

  return ret;
}

/* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
   references in the loop.  */

static void
mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
{
  struct mem_ref *ref;
  bool any = false;

  if (!may_use_storent_in_loop_p (loop))
    return;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      any |= mark_nontemporal_store (ref);

  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
    emit_mfence_after_loop (loop);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fills in DESC with the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
                      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  EST_NITER is the estimated number of iterations of
   the loop, or -1 if no estimate is available.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
                         unsigned ninsns, struct tree_niter_desc *desc,
                         HOST_WIDE_INT est_niter)
{
  unsigned upper_bound;
  unsigned nfactor, factor, mod_constraint;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  /* First check whether the loop is not too large to unroll.  We ignore
     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
     from unrolling them enough to make exactly one cache line covered by each
     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
     us from unrolling the loops too many times in cases where we only expect
     gains from better scheduling and decreasing loop overhead, which is not
     the case here.  */
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;

  /* If we unrolled the loop more times than it iterates, the unrolled version
     of the loop would never be entered.  */
  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
    upper_bound = est_niter;

  if (upper_bound <= 1)
    return 1;

  /* Choose the factor so that we may prefetch each cache line just once,
     but bound the unrolling by UPPER_BOUND.  */
  factor = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        {
          mod_constraint = ref->prefetch_mod;
          nfactor = least_common_multiple (mod_constraint, factor);
          if (nfactor <= upper_bound)
            factor = nfactor;
        }

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}
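
/* As an example of the factor selection above (assumed numbers): for
   references with PREFETCH_MOD 4 and 16 and an UPPER_BOUND of 32, the
   factor grows as lcm (4, 1) == 4 and then lcm (16, 4) == 16, so the loop
   is unrolled 16 times; per unrolled body, the first reference then gets
   4 prefetches and the second one a single prefetch, each covering one
   cache line.  With UPPER_BOUND == 8, the second constraint would be
   skipped and the factor would stay at 4.  */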

/* Returns the total volume of the memory references REFS, taking into account
   reuses in the innermost loop and cache line size.  TODO -- we should also
   take into account reuses across the iterations of the loops in the loop
   nest.  */

static unsigned
volume_of_references (struct mem_ref_group *refs)
{
  unsigned volume = 0;
  struct mem_ref_group *gr;
  struct mem_ref *ref;

  for (gr = refs; gr; gr = gr->next)
    for (ref = gr->refs; ref; ref = ref->next)
      {
        /* Almost always reuses another value?  */
        if (ref->prefetch_before != PREFETCH_ALL)
          continue;

        /* If several iterations access the same cache line, use the size of
           the line divided by this number.  Otherwise, a cache line is
           accessed in each iteration.  TODO -- in the latter case, we should
           take the size of the reference into account, rounding it up to a
           multiple of the cache line size.  */
        volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
      }
  return volume;
}

/* Returns the volume of memory references accessed across VEC iterations of
   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */

static unsigned
volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
{
  unsigned i;

  for (i = 0; i < n; i++)
    if (vec[i] != 0)
      break;

  if (i == n)
    return 0;

  gcc_assert (vec[i] > 0);

  /* We ignore the parts of the distance vector in subloops, since usually
     the numbers of iterations are much smaller.  */
  return loop_sizes[i] * vec[i];
}

/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
   at the position corresponding to the loop of the step.  N is the depth
   of the considered loop nest, and LOOP is its innermost loop.  */

static void
add_subscript_strides (tree access_fn, unsigned stride,
                       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
{
  struct loop *aloop;
  tree step;
  HOST_WIDE_INT astep;
  unsigned min_depth = loop_depth (loop) - n;

  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
    {
      aloop = get_chrec_loop (access_fn);
      step = CHREC_RIGHT (access_fn);
      access_fn = CHREC_LEFT (access_fn);

      if ((unsigned) loop_depth (aloop) <= min_depth)
        continue;

      if (host_integerp (step, 0))
        astep = tree_low_cst (step, 0);
      else
        astep = L1_CACHE_LINE_SIZE;

      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
    }
}

/* Returns the volume of memory references accessed between two consecutive
   self-reuses of the reference DR.  We consider the subscripts of DR in N
   loops, and LOOP_SIZES contains the volumes of accesses in each of the
   loops.  LOOP is the innermost loop of the current loop nest.  */

static unsigned
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
                     struct loop *loop)
{
  tree stride, access_fn;
  HOST_WIDE_INT *strides, astride;
  VEC (tree, heap) *access_fns;
  tree ref = DR_REF (dr);
  unsigned i, ret = ~0u;

  /* In the following example:

     for (i = 0; i < N; i++)
       for (j = 0; j < N; j++)
         use (a[j][i]);

     the same cache line is accessed each N steps (except if the change from
     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
     we cannot rely purely on the results of the data dependence analysis.

     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in which the stride is less than cache size.  */

  strides = XCNEWVEC (HOST_WIDE_INT, n);
  access_fns = DR_ACCESS_FNS (dr);

  for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
    {
      /* Keep track of the reference corresponding to the subscript, so that we
         know its stride.  */
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
        ref = TREE_OPERAND (ref, 0);

      if (TREE_CODE (ref) == ARRAY_REF)
        {
          stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
          if (host_integerp (stride, 1))
            astride = tree_low_cst (stride, 1);
          else
            astride = L1_CACHE_LINE_SIZE;

          ref = TREE_OPERAND (ref, 0);
        }
      else
        astride = 1;

      add_subscript_strides (access_fn, astride, strides, n, loop);
    }

  for (i = n; i-- > 0; )
    {
      unsigned HOST_WIDE_INT s;

      s = strides[i] < 0 ? -strides[i] : strides[i];

      if (s < (unsigned) L1_CACHE_LINE_SIZE
          && (loop_sizes[i]
              > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
        {
          ret = loop_sizes[i];
          break;
        }
    }

  free (strides);
  return ret;
}

/* Determines the distance till the first reuse of each reference in REFS
   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
   memory references in the loop.  */

static void
determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
                           bool no_other_refs)
{
  struct loop *nest, *aloop;
  VEC (data_reference_p, heap) *datarefs = NULL;
  VEC (ddr_p, heap) *dependences = NULL;
  struct mem_ref_group *gr;
  struct mem_ref *ref, *refb;
  VEC (loop_p, heap) *vloops = NULL;
  unsigned *loop_data_size;
  unsigned i, j, n;
  unsigned volume, dist, adist;
  HOST_WIDE_INT vol;
  data_reference_p dr;
  ddr_p dep;

  if (loop->inner)
    return;

  /* Find the outermost loop of the loop nest of loop (we require that
     there are no sibling loops inside the nest).  */
  nest = loop;
  while (1)
    {
      aloop = loop_outer (nest);

      if (aloop == current_loops->tree_root
          || aloop->inner->next)
        break;

      nest = aloop;
    }

  /* For each loop, determine the amount of data accessed in each iteration.
     We use this to estimate whether the reference is evicted from the
     cache before its reuse.  */
  find_loop_nest (nest, &vloops);
  n = VEC_length (loop_p, vloops);
  loop_data_size = XNEWVEC (unsigned, n);
  volume = volume_of_references (refs);
  i = n;
  while (i-- != 0)
    {
      loop_data_size[i] = volume;
      /* Bound the volume by the L2 cache size, since above this bound,
         all dependence distances are equivalent.  */
      if (volume > L2_CACHE_SIZE_BYTES)
        continue;

      aloop = VEC_index (loop_p, vloops, i);
      vol = estimated_loop_iterations_int (aloop, false);
      if (vol < 0)
        vol = expected_loop_iterations (aloop);
      volume *= vol;
    }

  /* Prepare the references in the form suitable for data dependence
     analysis.  We ignore unanalyzable data references (the results
     are used just as a heuristic to estimate temporality of the
     references, hence we do not need to worry about correctness).  */
  for (gr = refs; gr; gr = gr->next)
    for (ref = gr->refs; ref; ref = ref->next)
      {
        dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);

        if (dr)
          {
            ref->reuse_distance = volume;
            dr->aux = ref;
            VEC_safe_push (data_reference_p, heap, datarefs, dr);
          }
        else
          no_other_refs = false;
      }

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    {
      dist = self_reuse_distance (dr, loop_data_size, n, loop);
      ref = (struct mem_ref *) dr->aux;
      if (ref->reuse_distance > dist)
        ref->reuse_distance = dist;

      if (no_other_refs)
        ref->independent_p = true;
    }

  compute_all_dependences (datarefs, &dependences, vloops, true);

  for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
    {
      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
        continue;

      ref = (struct mem_ref *) DDR_A (dep)->aux;
      refb = (struct mem_ref *) DDR_B (dep)->aux;

      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
          || DDR_NUM_DIST_VECTS (dep) == 0)
        {
          /* If the dependence cannot be analyzed, assume that there might be
             a reuse.  */
          dist = 0;

          ref->independent_p = false;
          refb->independent_p = false;
        }
      else
        {
          /* The distance vectors are normalized to be always lexicographically
             positive, hence we cannot tell just from them whether DDR_A comes
             before DDR_B or vice versa.  However, it is not important,
             anyway -- if DDR_A is close to DDR_B, then it is either reused in
             DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
             in cache (and marking it as nontemporal would not affect
             anything).  */

          dist = volume;
          for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
            {
              adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
                                             loop_data_size, n);

              /* If this is a dependence in the innermost loop (i.e., the
                 distances in all superloops are zero) and it is not
                 the trivial self-dependence with distance zero, record that
                 the references are not completely independent.  */
              if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
                  && (ref != refb
                      || DDR_DIST_VECT (dep, j)[n-1] != 0))
                {
                  ref->independent_p = false;
                  refb->independent_p = false;
                }

              /* Ignore accesses closer than
                 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
                 so that we use nontemporal prefetches e.g. if single memory
                 location is accessed several times in a single iteration of
                 the loop.  */
              if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
                continue;

              if (adist < dist)
                dist = adist;
            }
        }

      if (ref->reuse_distance > dist)
        ref->reuse_distance = dist;
      if (refb->reuse_distance > dist)
        refb->reuse_distance = dist;
    }

  free_dependence_relations (dependences);
  free_data_refs (datarefs);
  free (loop_data_size);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Reuse distances:\n");
      for (gr = refs; gr; gr = gr->next)
        for (ref = gr->refs; ref; ref = ref->next)
          fprintf (dump_file, " ref %p distance %u\n",
                   (void *) ref, ref->reuse_distance);
    }
}

/* Do a cost-benefit analysis to determine if prefetching is profitable
   for the current loop given the following parameters:
   AHEAD: the iteration ahead distance,
   EST_NITER: the estimated trip count,
   NINSNS: estimated number of instructions in the loop,
   PREFETCH_COUNT: an estimate of the number of prefetches
   MEM_REF_COUNT: total number of memory references in the loop.  */

static bool
is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
                                unsigned ninsns, unsigned prefetch_count,
                                unsigned mem_ref_count)
{
  int insn_to_mem_ratio, insn_to_prefetch_ratio;

  if (mem_ref_count == 0)
    return false;

  /* Prefetching improves performance by overlapping cache missing
     memory accesses with CPU operations.  If the loop does not have
     enough CPU operations to overlap with memory operations, prefetching
     won't give a significant benefit.  One approximate way of checking
     this is to require the ratio of instructions to memory references to
     be above a certain limit.  This approximation works well in practice.
     TODO: Implement a more precise computation by estimating the time
     for each CPU or memory op in the loop.  Time estimates for memory ops
     should account for cache misses.  */
  insn_to_mem_ratio = ninsns / mem_ref_count;

  if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
    return false;

  /* Profitability of prefetching is highly dependent on the trip count.
     For a given AHEAD distance, the first AHEAD iterations do not benefit
     from prefetching, and the last AHEAD iterations execute useless
     prefetches.  So, if the trip count is not large enough relative to AHEAD,
     prefetching may cause serious performance degradation.  To avoid this
     problem when the trip count is not known at compile time, we
     conservatively skip loops with high prefetching costs.  For now, only
     the I-cache cost is considered.  The relative I-cache cost is estimated
     by taking the ratio between the number of prefetches and the total
     number of instructions.  Since we are using integer arithmetic, we
     compute the reciprocal of this ratio.
     TODO: Account for loop unrolling, which may reduce the costs of
     shorter stride prefetches.  Note that not accounting for loop
     unrolling over-estimates the cost and hence gives more conservative
     results.  */
  if (est_niter < 0)
    {
      insn_to_prefetch_ratio = ninsns / prefetch_count;
      return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
    }

  if (est_niter <= (HOST_WIDE_INT) ahead)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Not prefetching -- loop estimated to roll only %d times\n",
                 (int) est_niter);
      return false;
    }
  return true;
}
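
/* A worked example of the cost model above (assumed numbers, not the
   defaults of any particular target): for a loop with ninsns == 30,
   mem_ref_count == 6 and prefetch_count == 3, the insn-to-mem ratio is
   30 / 6 == 5, which is compared against PREFETCH_MIN_INSN_TO_MEM_RATIO.
   If the trip count is unknown (est_niter < 0), the insn-to-prefetch
   ratio 30 / 3 == 10 is additionally compared against
   MIN_INSN_TO_PREFETCH_RATIO before prefetching is allowed.  */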


/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  */

static bool
loop_prefetch_arrays (struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, time, unroll_factor;
  HOST_WIDE_INT est_niter;
  struct tree_niter_desc desc;
  bool unrolled = false, no_other_refs;
  unsigned prefetch_count;
  unsigned mem_ref_count;

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "  ignored (cold area)\n");
      return false;
    }

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  prefetch_count = estimate_prefetch_count (refs);
  if (prefetch_count == 0)
    goto fail;

  determine_loop_nest_reuse (loop, refs, no_other_refs);

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: the time should be weighted by the probabilities of the blocks in
     the loop body.  */
  time = tree_num_loop_insns (loop, &eni_time_weights);
  ahead = (PREFETCH_LATENCY + time - 1) / time;
  est_niter = estimated_loop_iterations_int (loop, false);

  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
                                           est_niter);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
             HOST_WIDE_INT_PRINT_DEC "\n"
             "insn count %d, mem ref count %d, prefetch count %d\n",
             ahead, unroll_factor, est_niter,
             ninsns, mem_ref_count, prefetch_count);

  if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns,
                                       prefetch_count, mem_ref_count))
    goto fail;

  mark_nontemporal_stores (loop, refs);

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor,
                        single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}

/* Issue prefetch instructions for array references in loops.  */

unsigned int
tree_ssa_prefetch_arrays (void)
{
  loop_iterator li;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486
         -march=pentium4.  -mtune=i486 causes us to have PREFETCH_BLOCK 0,
         since this is part of the processor costs and i486 does not have
         prefetch, but -march=pentium4 causes HAVE_prefetch to be true.
         Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
               SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
               L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
      fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
               MIN_INSN_TO_PREFETCH_RATIO);
      fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
               PREFETCH_MIN_INSN_TO_MEM_RATIO);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
                                       tree_cons (NULL_TREE,
                                                  const_ptr_type_node,
                                                  NULL_TREE));
      tree decl = add_builtin_function ("__builtin_prefetch", type,
                                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
                                        NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that the size of a cache line is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}