/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "profile-count.h"
#include "predict.h"

/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = CONST_INT_P (bytes_rtx);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
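
/* For example, a 10-byte clear with 32-bit alignment on a 64-bit
   target that does not require strict alignment goes through the loop
   above twice: one DImode store of zero followed by one HImode
   store.  */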

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_V16QImode:
      switch (mode)
	{
	case E_V16QImode:
	  if (!BYTES_BIG_ENDIAN)
	    {
	      if (TARGET_P9_VECTOR)
		emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
	      else
		{
		  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
						      V16QImode, 0);
		  gcc_assert (MEM_P (mem));
		  rtx addr = XEXP (mem, 0);
		  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
		  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
		  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
		  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
		}
	    }
	  else
	    emit_insn (gen_vsx_movv2di_64bit (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_QImode:
      gcc_assert (mode == E_QImode);
      emit_move_insn (reg, mem);
      break;

    default:
      gcc_unreachable ();
      break;
    }
}
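
/* A note on the loads above: on big-endian the natural loads already
   put the first memory byte in the most significant position, and on
   little-endian the byte-reversing loads behind the bswap insns
   achieve the same thing, with narrow chunks zero extended.  Either
   way, each register holds its bytes as an unsigned big-endian value,
   which is why the callers can get memcmp-style ordering from a
   single unsigned compare or subtract.  */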

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared
   in bytes.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
	   && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback is to do one byte.  */
  return QImode;
}
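
/* For example, with 7 bytes remaining at offset 8 on a 64-bit target
   that has TARGET_EFFICIENT_OVERLAPPING_UNALIGNED, the overlap case
   above returns word_mode: the caller can back up one byte and do a
   full-word read that ends exactly at the end of the block.  */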

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
			   unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
  return MIN (base_align, offset & -offset);
}
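
/* Note that offset & -offset isolates the lowest set bit of offset,
   i.e. the largest power of two dividing it, so e.g. a base alignment
   of 16 with offset 12 yields an alignment of 4.  */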

/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to be loaded.
   ORIG_ADDR is the original address expression.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
			       rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
  return;
}

/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.
   BR_PROB is the estimated branch probability for the branch.

   If CR is null_rtx, then a new register of CMPMODE is generated and
   used for the comparison.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, br_prob);
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
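
/* A typical use, as in the cleanup code further down:
   do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX, final_label,
	      profile_probability::unlikely ())
   emits a compare of cmp_rem against zero into a fresh CC register
   and a conditional branch to final_label.  */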

/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx for the isel condition.
   SRC_T is the isel source if the condition is true.
   SRC_F is the isel source if the condition is false.
   CR is the condition code reg used by the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}

/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1-SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1+SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}

/* Emit an and of the proper mode for DEST.

   DEST is the destination register for the and.
   SRC1 is the first and input.
   SRC2 is the second and input.

   Computes DEST = SRC1&SRC2.  */
static void
do_and3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_anddi3 (dest, src1, src2));
  else
    emit_insn (gen_andsi3 (dest, src1, src2));
}

/* Emit a cmpb of the proper mode for DEST.

   DEST is the destination register for the cmpb.
   SRC1 is the first input.
   SRC2 is the second input.

   Computes cmpb of SRC1, SRC2.  */
static void
do_cmpb3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_cmpbdi3 (dest, src1, src2));
  else
    emit_insn (gen_cmpbsi3 (dest, src1, src2));
}

/* Emit a rotl of the proper mode for DEST.

   DEST is the destination register for the rotate.
   SRC1 is the register to be rotated.
   SRC2 is the rotate count.

   Computes DEST = SRC1 rotated left by SRC2.  */
static void
do_rotl3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_rotldi3 (dest, src1, src2));
  else
    emit_insn (gen_rotlsi3 (dest, src1, src2));
}

/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem,
		      rtx dcond, rtx src1_addr, rtx src2_addr,
		      rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
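
/* For example, with an 8-byte load_mode and 3 bytes remaining,
   shift_amount above becomes (8 - 3) * 8 = 40, so the logical shifts
   discard the 5 bytes beyond the remainder from both loaded words
   before they are compared.  */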

/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			 HOST_WIDE_INT bytes_rem, rtx diff,
			 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			 rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
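
/* For example, with an 8-byte load_mode and a constant 3-byte
   remainder, addr_adj above is 5: the loads start 5 bytes before the
   current position so the full-word read ends exactly at the end of
   the block, re-comparing 5 already-matched bytes instead of
   shifting.  */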

/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
   instructions.

   BYTES_TO_COMPARE is the number of bytes to be compared.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   OFF_REG is the register to use for the string offset for loads.
   S1DATA is the register for loading the first string.
   S2DATA is the register for loading the second string.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.
   CHECKZERO indicates whether the sequence should check for zero bytes
   for use doing strncmp, or not (for use doing memcmp).  */
static void
expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
			 rtx orig_src1, rtx orig_src2,
			 rtx s1addr, rtx s2addr, rtx off_reg,
			 rtx s1data, rtx s2data, rtx vec_result,
			 bool equality_compare_rest, rtx *p_cleanup_label,
			 rtx final_move_label, bool checkzero)
{
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx zero_reg = NULL;

  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));

  if (checkzero && !TARGET_P9_VECTOR)
    {
      zero_reg = gen_reg_rtx (V16QImode);
      emit_move_insn (zero_reg, CONST0_RTX (V16QImode));
    }

  while (bytes_to_compare > 0)
    {
      /* VEC/VSX compare sequence for P8:
	 check each 16B with:
	 lxvd2x 32,28,8
	 lxvd2x 33,29,8
	 vcmpequb 2,0,1	 # compare strings
	 vcmpequb 4,0,3	 # compare w/ 0
	 xxlorc 37,36,34 # first FF byte is either mismatch or end of string
	 vcmpequb. 7,5,3 # reg 7 contains 0
	 bnl 6,.Lmismatch

	 For the P8 LE case, we use lxvd2x and compare full 16 bytes
	 but then use vgbbd and a shift to get two bytes with the
	 information we need in the correct order.

	 VEC/VSX compare sequence if TARGET_P9_VECTOR:
	 lxvb16x/lxvb16x	 # load 16B of each string
	 vcmpnezb.		 # produces difference location or zero byte location
	 bne 6,.Lmismatch

	 Use the overlapping compare trick for the last block if it is
	 less than 16 bytes.  */

      load_mode = V16QImode;
      load_mode_size = GET_MODE_SIZE (load_mode);

      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else
	{
	  /* Move this load back so it doesn't go past the end.  P8/P9
	     can do this efficiently.  This is never called with less
	     than 16 bytes so we should always be able to do this.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  gcc_assert (offset > extra_bytes);
	  offset -= extra_bytes;
	  cmp_bytes = load_mode_size;
	  bytes_to_compare = cmp_bytes;
	}

      /* The offset currently used is always kept in off_reg so that the
	 cleanup code on P8 can use it to extract the differing byte.  */
      emit_move_insn (off_reg, GEN_INT (offset));

      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	    A != B: branch to cleanup code to compute result.
	    A == B: next block
	 2: End of the inline comparison:
	    A != B: branch to cleanup code to compute result.
	    A == B: call strcmp/strncmp
	 3: Compared requested N bytes:
	    A == B: branch to result 0.
	    A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      if (checkzero)
	{
	  if (TARGET_P9_VECTOR)
	    emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
	  else
	    {
	      /* Emit instructions to do comparison and zero check.  */
	      rtx cmp_res = gen_reg_rtx (load_mode);
	      rtx cmp_zero = gen_reg_rtx (load_mode);
	      rtx cmp_combined = gen_reg_rtx (load_mode);
	      emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
	      emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
	      emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
	      emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result,
						 zero_reg));
	    }
	}
      else
	emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data));

      bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
      rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
      rtx dst_label;
      rtx cmp_rtx;
      if (branch_to_cleanup)
	{
	  /* Branch to cleanup code, otherwise fall through to do more
	     compares.  P8 and P9 use different CR bits because on P8
	     we are looking at the result of a comparison vs a
	     register of zeroes so the all-true condition means no
	     difference or zero was found.  On P9, vcmpnezb sets a byte
	     to 0xff if there is a mismatch or zero, so the all-false
	     condition indicates we found no difference or zero.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	  if (TARGET_P9_VECTOR && checkzero)
	    cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
	}
      else
	{
	  /* Branch to final return or fall through to cleanup,
	     result is already set to 0.  */
	  dst_label = final_move_label;
	  if (TARGET_P9_VECTOR && checkzero)
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
	}

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					 lab_ref, pc_rtx);
      rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      add_reg_br_prob_note (j2, profile_probability::likely ());
      JUMP_LABEL (j2) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }
  *p_cleanup_label = cleanup_label;
  return;
}
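
/* To illustrate the loop above: a 37-byte compare checks full 16-byte
   chunks at offsets 0 and 16, then for the final 5 bytes backs the
   offset up to 21 so the last load is again a full 16 bytes ending
   exactly at the end of the block.  */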

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   P8:
   vgbbd 0,0
   vsldoi 0,0,0,9
   mfvsrd 9,32
   addi 10,9,-1	   # count trailing zero bits
   andc 9,10,9
   popcntd 9,9
   lbzx 10,28,9	   # use that offset to load differing byte
   lbzx 3,29,9
   subf 3,3,10	   # subtract for final result

   P9:
   vclzlsbb	   # counts trailing bytes with lsb=0
   vextublx	   # extract differing byte

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   OFF_REG is the register to use for the string offset for loads.
   VEC_RESULT is the rtx for the vector result indicating the byte
   difference.  */

static void
emit_final_compare_vec (rtx str1, rtx str2, rtx result,
			rtx s1addr, rtx s2addr,
			rtx orig_src1, rtx orig_src2,
			rtx off_reg, rtx vec_result)
{
  if (TARGET_P9_VECTOR)
    {
      rtx diffix = gen_reg_rtx (SImode);
      rtx chr1 = gen_reg_rtx (SImode);
      rtx chr2 = gen_reg_rtx (SImode);
      rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
      rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
      emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
      emit_insn (gen_vextublx (chr1, diffix, str1));
      emit_insn (gen_vextublx (chr2, diffix, str2));
      do_sub3 (result, chr1_di, chr2_di);
    }
  else
    {
      gcc_assert (TARGET_P8_VECTOR);
      rtx diffix = gen_reg_rtx (DImode);
      rtx result_gbbd = gen_reg_rtx (V16QImode);
      /* Since each byte of the input is either 00 or FF, the bytes in
	 dw0 and dw1 after vgbbd are all identical to each other.  */
      emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
      /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
	 For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
      rtx result_shifted = gen_reg_rtx (V16QImode);
      int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
      emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
					   result_gbbd, GEN_INT (shift_amt)));

      rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
      emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
      rtx count = gen_reg_rtx (DImode);

      if (BYTES_BIG_ENDIAN)
	emit_insn (gen_clzdi2 (count, diffix));
      else
	emit_insn (gen_ctzdi2 (count, diffix));

      /* P8 doesn't have a good solution for extracting one byte from
	 a vsx reg like vextublx on P9 so we just compute the offset
	 of the differing byte and load it from each string.  Each bit
	 of the two mask bytes corresponds to one byte of the 16-byte
	 chunk, so the zero count computed above is directly the byte
	 offset of the first mismatch or zero.  */
      do_add3 (off_reg, off_reg, count);

      rtx chr1 = gen_reg_rtx (QImode);
      rtx chr2 = gen_reg_rtx (QImode);
      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
      machine_mode rmode = GET_MODE (result);
      rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
      rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
      do_sub3 (result, chr1_rm, chr2_rm);
    }

  return;
}

/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv1.  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv2.  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv1.  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv2.  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);

  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx_insn *j;

  /* Example of generated code for 35 bytes aligned 1 byte.

	 mtctr 8
	 li 6,0
	 li 5,8
     .L13:
	 ldbrx 7,3,6
	 ldbrx 9,10,6
	 ldbrx 0,3,5
	 ldbrx 4,10,5
	 addi 6,6,16
	 addi 5,5,16
	 subfc. 9,9,7
	 bne 0,.L10
	 subfc. 9,4,0
	 bdnzt 2,.L13
	 bne 0,.L10
	 add 3,3,6
	 add 10,10,6
	 addi 9,3,-5
	 ldbrx 7,0,9
	 addi 9,10,-5
	 ldbrx 9,0,9
	 subfc 9,9,7
	 .p2align 4,,15
     .L10:
	 popcntd 9,9
	 subfe 10,10,10
	 or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call  */

  /* If bytes is not const, compare the length and branch directly to
     the cleanup code that can handle 0-16 bytes if the length is less
     than loop_bytes.  Stash the length away for the cleanup code and
     the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label, profile_probability::even ());

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label, profile_probability::even ());

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	{
	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
	}
      else
	{
	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
	}

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isel.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after, profile_probability::even ());
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if remaining bytes is
	 < 16 in the runtime case either.  Compute number of loop
	 iterations.  We compare 2*word_mode per iteration so 16B for
	 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* inner loop to compare 2*word_mode */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label, profile_probability::unlikely ());

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      add_reg_br_prob_note (j, profile_probability::likely ());
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }
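
  /* The bdnztf patterns above close the loop with a single bdnzt:
     branch back while the decremented CTR is nonzero and the second
     pair of words compared equal.  Control therefore falls out of the
     loop either on a mismatch (with diff/dcond set) or when the
     iteration count is exhausted.  */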

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
	       diff_label, profile_probability::unlikely ());
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
	       diff_label, profile_probability::unlikely ());

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder and if we are here then diff is 0 so just return 0.  */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
	{
	  /* If we're dealing with runtime length, we have to check if
	     it's zero after the loop.  When length is known at compile
	     time the no-remainder condition is dealt with above.  By
	     doing this after cleanup_label, we also deal with the
	     case where length is 0 at the start and we bypass the
	     loop with a branch to cleanup_label.  */
	  emit_move_insn (target, const0_rtx);
	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		     NULL_RTX, final_label, profile_probability::unlikely ());
	}

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
	{
	  /* If remainder length < word length, branch to final
	     cleanup compare.  */
	  if (!bytes_is_const)
	    {
	      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
			 NULL_RTX, final_cleanup, profile_probability::even ());
	    }

	  /* load and compare 8B */
	  do_load_for_compare_from_addr (load_mode, d1_1,
					 src1_addr, orig_src1);
	  do_load_for_compare_from_addr (load_mode, d2_1,
					 src2_addr, orig_src2);

	  /* Compare the word, see if we need to do the last partial.  */
	  if (TARGET_P9_MISC)
	    {
	      /* Generate a compare, and convert with a setb later.  */
	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	      emit_insn (gen_rtx_SET (dcond, cmp));
	    }
	  else
	    {
	      dcond = gen_reg_rtx (CCmode);
	      if (word_mode == DImode)
		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	      else
		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	    }

	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		     dcond, diff_label, profile_probability::even ());

	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
	  emit_move_insn (cmp_rem_before, cmp_rem);
	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
	  if (bytes_is_const)
	    bytes_remaining -= load_mode_size;
	  else
	    /* See if remaining length is now zero.  We previously set
	       target to 0 so we can just jump to the end.  */
	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
		       final_label, profile_probability::unlikely ());
	}

      /* Cases:
	 bytes_is_const
	   We can always shift back to do an overlapping compare
	   of the last chunk because we know length >= 8.

	 !bytes_is_const
	   align>=load_mode_size
	     Read word_mode and mask
	   align<load_mode_size
	     avoid stepping past end

	 Three strategies:
	 * decrement address and do overlapping compare
	 * read word_mode and mask
	 * carefully avoid crossing 4k boundary  */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
	  && align1 >= load_mode_size && align2 >= load_mode_size)
	{
	  /* Alignment is larger than word_mode so we do not need to be
	     concerned with extra page crossings.  But, we do not know
	     that the length is larger than load_mode_size so we might
	     end up comparing against data before the block if we try
	     an overlapping compare.  Also we use this on P7 for fixed length
	     remainder because P7 doesn't like overlapping unaligned.
	     Strategy: load 8B, shift off bytes past length, and compare.  */
	  emit_label (final_cleanup);
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);
	}
      else if (bytes_remaining && bytes_is_const)
	{
	  /* We do not do loop expand if length < 32 so we know at the
	     end we can do an overlapping compare.
	     Strategy: shift address back and do word_mode load that
	     ends at the end of the block.  */
	  emit_label (final_cleanup);
	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);
	}
      else if (!bytes_is_const)
	{
	  rtx handle4k_label = gen_label_rtx ();
	  rtx nonconst_overlap = gen_label_rtx ();
	  emit_label (nonconst_overlap);

	  /* Here we have to handle the case where we have runtime
	     length which may be too short for overlap compare, and
	     alignment is not at least load_mode_size so we have to
	     tread carefully to avoid stepping across 4k boundaries.  */

	  /* If the length after the loop was larger than word_mode
	     size, we can just do an overlapping compare and we're
	     done.  We fall through to this code from the word_mode
	     compare that precedes this.  */
	  do_overlap_load_compare (load_mode, false, 0, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);

	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* If we couldn't do the overlap compare we have to be more
	     careful of the 4k boundary.  Test to see if either
	     address is less than word_mode_size away from a 4k
	     boundary.  If not, then we can do a load/shift/compare
	     and we are done.  We come to this code if length was less
	     than word_mode_size.  */

	  emit_label (final_cleanup);

	  /* We can still avoid the slow case if the length was larger
	     than one loop iteration, in which case go do the overlap
	     load compare path.  */
	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
		     NULL_RTX, nonconst_overlap, profile_probability::even ());

	  rtx rem4k = gen_reg_rtx (word_mode);
	  rtx dist1 = gen_reg_rtx (word_mode);
	  rtx dist2 = gen_reg_rtx (word_mode);
	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());

	  /* We don't have a 4k boundary to deal with, so do
	     a load/shift/compare and jump to diff.  */
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);

	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* Finally in the unlikely case we are inching up to a
	     4k boundary we use a compact lbzx/compare loop to do
	     it a byte at a time.  */
	  emit_label (handle4k_label);

	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
	  emit_move_insn (ctr, cmp_rem);
	  rtx ixreg = gen_reg_rtx (Pmode);
	  emit_move_insn (ixreg, const0_rtx);

	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
	  rtx d1 = gen_reg_rtx (word_mode);
	  rtx d2 = gen_reg_rtx (word_mode);

	  rtx fc_loop = gen_label_rtx ();
	  emit_label (fc_loop);

	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

	  do_add3 (ixreg, ixreg, const1_rtx);

	  rtx cond = gen_reg_rtx (CCmode);
	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);

	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
	  if (TARGET_64BIT)
	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  else
	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  add_reg_br_prob_note (j, profile_probability::likely ());
	  JUMP_LABEL (j) = fc_loop;
	  LABEL_NUSES (fc_loop) += 1;

	  if (TARGET_64BIT)
	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	  else
	    emit_move_insn (target, diff);

	  /* Since we are comparing bytes, the difference can be used
	     as the final result and we are done here.  */
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
	  JUMP_LABEL (j) = final_label;
	  LABEL_NUSES (final_label) += 1;
	  emit_barrier ();
	}
    }

  emit_label (diff_label);
  /* difference handling, 64->32 conversion */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
	{
	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntddi2 (diff, diff));
	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	}
      else
	{
	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntdsi2 (diff, diff));
	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
	}
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  For a constant length, cmp_rem
	 holds the number of bytes left to compare and the addresses
	 are advanced past what the loop compared; for a runtime
	 length we branch here before comparing anything, so memcmp is
	 called on the original addresses with the full length.  If we
	 don't find a difference in the loop compare, do the library
	 call directly instead of doing a small compare just to get to
	 an arbitrary boundary before calling it anyway.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
	{
	  emit_move_insn (len_rtx, cmp_rem);
	  do_add3 (src1_addr, src1_addr, iv1);
	  do_add3 (src2_addr, src2_addr, iv1);
	}
      else
	emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			       target, LCT_NORMAL, GET_MODE (target),
			       src1_addr, Pmode,
			       src2_addr, Pmode,
			       len_rtx, GET_MODE (len_rtx));
    }

  /* emit final_label */
  emit_label (final_label);
  return true;
}

/* Generate code to convert a DImode-plus-carry subtract result into
   a SImode result that has the same <0 / ==0 / >0 properties to
   produce the final result from memcmp.

   TARGET is the rtx for the register to receive the memcmp result.
   SUB_RESULT is the rtx for the register containing the subtract
   result.  */

void
generate_6432_conversion (rtx target, rtx sub_result)
{
  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_64BIT)
    {
      rtx tmp_reg_ca = gen_reg_rtx (DImode);
      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
      rtx popcnt = gen_reg_rtx (DImode);
      emit_insn (gen_popcntddi2 (popcnt, sub_result));
      rtx tmp2 = gen_reg_rtx (DImode);
      emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
    }
  else
    {
      rtx tmp_reg_ca = gen_reg_rtx (SImode);
      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
      rtx popcnt = gen_reg_rtx (SImode);
      emit_insn (gen_popcntdsi2 (popcnt, sub_result));
      emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
    }
}
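
/* Why the popcnt sequence above preserves the ordering: the subtract
   that produced SUB_RESULT leaves the carry set exactly when no
   borrow occurred, so subf*3_carry_in_xx materializes carry minus
   one: 0 when the first operand was greater than or equal, -1 when it
   was smaller.  popcntd maps any nonzero difference to a small
   positive value, so the final OR yields 0 for equality, a positive
   result when the first block is greater, and -1 when it is
   smaller.  */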
  unsigned HOST_WIDE_INT offset = 0;
  unsigned int load_mode_size;
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  bool need_6432_conv = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;
  machine_mode load_mode;

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes;
          cmp_bytes = bytes;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
        {
          /* Shift unneeded bytes off.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
          if (word_mode == DImode)
            {
              emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
        {
          /* Final_result is larger than load size so we don't need to
             reduce result size.  */

          /* We previously did a block that needed a 64->32 conversion
             but the current block does not, so a label is needed to
             jump to the end.  */
          if (need_6432_conv && !final_label)
            final_label = gen_label_rtx ();

          if (remain > 0)
            {
              /* This is not the last block, branch to the end if the
                 result of this subtract is not zero.  */
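              /* I believe rs6000_emit_dot_insn with dot == 2 emits the
                 record form of the subtract below, so a single subf.
                 both writes tmp_reg_src2 and sets the CR register that
                 the conditional branch then tests.  */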
              if (!final_label)
                final_label = gen_label_rtx ();
              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
              rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
              rtx cr = gen_reg_rtx (CCmode);
              rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
              emit_insn (gen_movsi (final_result,
                                    gen_lowpart (SImode, tmp_reg_src2)));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 fin_ref, pc_rtx);
              rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              add_reg_br_prob_note (j, profile_probability::unlikely ());
              JUMP_LABEL (j) = final_label;
              LABEL_NUSES (final_label) += 1;
            }
          else
            {
              if (word_mode == DImode)
                {
                  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
                                         tmp_reg_src2));
                  emit_insn (gen_movsi (final_result,
                                        gen_lowpart (SImode, tmp_reg_src2)));
                }
              else
                emit_insn (gen_subsi3 (final_result, tmp_reg_src1,
                                       tmp_reg_src2));

              if (final_label)
                {
                  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
                  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
                  JUMP_LABEL (j) = final_label;
                  LABEL_NUSES (final_label) += 1;
                  emit_barrier ();
                }
            }
        }
      else
        {
          /* Do we need a 64->32 conversion block?  We need the 64->32
             conversion even if final_result size == load_mode size
             because the subtract generates one extra bit.  */
          need_6432_conv = true;

          if (remain > 0)
            {
              if (!convert_label)
                convert_label = gen_label_rtx ();

              /* Compare to zero and branch to convert_label if not
                 zero.  */
              rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
              if (TARGET_P9_MISC)
                {
                  /* Generate a compare, and convert with a setb later.
                     Use cond that is passed in because the caller
                     needs to use it for the 64->32 conversion
                     later.  */
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                {
                  /* Generate a subfc. and use the longer sequence for
                     conversion.  Cond is not used outside this
                     function in this case.  */
                  cond = gen_reg_rtx (CCmode);
                  if (TARGET_64BIT)
                    emit_insn (gen_subfdi3_carry_dot2 (sub_result,
                                                       tmp_reg_src2,
                                                       tmp_reg_src1,
                                                       cond));
                  else
                    emit_insn (gen_subfsi3_carry_dot2 (sub_result,
                                                       tmp_reg_src2,
                                                       tmp_reg_src1,
                                                       cond));
                }

              rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 cvt_ref, pc_rtx);
              rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              add_reg_br_prob_note (j, profile_probability::likely ());
              JUMP_LABEL (j) = convert_label;
              LABEL_NUSES (convert_label) += 1;
            }
          else
            {
              /* Just do the subtract/compare.  Since this is the last
                 block, the convert code will be generated immediately
                 following.  */
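              /* On P9 the CCUNSmode compare recorded in cond here is
                 consumed by the setb in expand_block_compare below;
                 older targets instead leave sub_result and the carry
                 for generate_6432_conversion.  */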
              if (TARGET_P9_MISC)
                {
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else if (TARGET_64BIT)
                emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
                                              tmp_reg_src1));
              else
                emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
                                              tmp_reg_src1));
            }
        }

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (convert_label)
    *p_convert_label = convert_label;
  if (final_label)
    *p_final_label = final_label;
  return need_6432_conv;
}

/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
          || targetm.slow_unaligned_access (word_mode,
                                            MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond = NULL;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);

  /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 33 && !TARGET_32BIT
                 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;

  /* Don't generate too much code if vsx was disabled.  */
  if (!use_vec && max_bytes > 1)
    max_bytes = ((max_bytes + 1) / 2) - 1;

  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
          || (base_align == 2 && bytes > 32)))
    return false;

  rtx final_label = NULL;

  if (use_vec)
    {
      rtx final_move_label = gen_label_rtx ();
      rtx s1addr = gen_reg_rtx (Pmode);
      rtx s2addr = gen_reg_rtx (Pmode);
      rtx off_reg = gen_reg_rtx (Pmode);
      rtx cleanup_label = NULL;
      rtx vec_result = gen_reg_rtx (V16QImode);
      rtx s1data = gen_reg_rtx (V16QImode);
      rtx s2data = gen_reg_rtx (V16QImode);
      rtx result_reg = gen_reg_rtx (word_mode);
      emit_move_insn (result_reg, GEN_INT (0));

      expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
                               s1addr, s2addr, off_reg, s1data, s2data,
                               vec_result, false,
                               &cleanup_label, final_move_label, false);

      if (cleanup_label)
        emit_label (cleanup_label);

      emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));

      emit_final_compare_vec (s1data, s2data, result_reg,
                              s1addr, s2addr, orig_src1, orig_src2,
                              off_reg, vec_result);

      emit_label (final_move_label);
      emit_insn (gen_movsi (target,
                            gen_lowpart (SImode, result_reg)));
    }
  else
    { /* generate GPR code */
      rtx convert_label = NULL;
      rtx sub_result = gen_reg_rtx (word_mode);
      bool need_6432_conversion =
        expand_block_compare_gpr (bytes, base_align,
                                  orig_src1, orig_src2,
                                  sub_result, cond, target,
                                  &convert_label, &final_label);

      if (need_6432_conversion)
        {
          if (convert_label)
            emit_label (convert_label);
          if (TARGET_P9_MISC)
            emit_insn (gen_setb_unsigned (target, cond));
          else
            generate_6432_conversion (target, sub_result);
        }
    }

  if (final_label)
    emit_label (final_label);

  return true;
}

/* Generate page crossing check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch to if there is a page crossing.
   SRC_ADDR is the string address to be examined.
   BYTES is the max number of bytes to compare.  */
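/* For example, with BYTES == 16 the check below computes
   SRC_ADDR & 0xfff and branches to STRNCMP_LABEL when that page
   offset is at or above 4096 - 16, i.e. whenever a 16-byte load
   starting at SRC_ADDR might run into the next, possibly unmapped,
   4 KiB page.  */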
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src_addr,
                            HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
  do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
                                         GEN_INT (4096 - bytes)));

  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                     lab_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, profile_probability::unlikely ());
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}

/* Generate the sequence of compares for strcmp/strncmp using gpr
   instructions.
   BYTES_TO_COMPARE is the number of bytes to be compared.
   BASE_ALIGN is the smaller of the alignments of the two strings.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   TMP_REG_SRC1 is the register for loading the first string.
   TMP_REG_SRC2 is the register for loading the second string.
   RESULT_REG is the rtx for the result register.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a
   cleanup call to strcmp/strncmp if we have equality at the end of
   the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we
   need code to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can
   just set the final result.  */
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
                             unsigned int base_align,
                             rtx orig_src1, rtx orig_src2,
                             rtx tmp_reg_src1, rtx tmp_reg_src2,
                             rtx result_reg,
                             bool equality_compare_rest,
                             rtx *p_cleanup_label,
                             rtx final_move_label)
{
  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  while (bytes_to_compare > 0)
    {
      /* GPR compare sequence:
         check each 8B with: ld/ld/cmpb/cmpb/orc./bne

         cleanup code at end:
         cntlzd        get bit of first zero/diff byte
         subfic        convert for rldcl use
         rldcl rldcl   extract diff/zero byte
         subf          subtract for final result

         The last compare can branch around the cleanup code if the
         result is zero because the strings are exactly equal.  */

      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes_to_compare,
                                             align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
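          /* For example, with word-sized loads: 5 bytes left at
             offset 8 become an 8-byte load at offset 5.  The three
             bytes read again were already found equal, and the load
             no longer reads past the end of the buffer.  */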
          unsigned int extra_bytes = load_mode_size - bytes_to_compare;
          cmp_bytes = bytes_to_compare;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes_to_compare = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes_to_compare;

      rtx offset_rtx;
      if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
        offset_rtx = GEN_INT (offset);
      else
        {
          offset_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (offset_rtx, GEN_INT (offset));
        }
      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);

      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1,
                                     orig_src1);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2,
                                     orig_src2);

      /* We must always left-align the data we read, and
         clear any bytes to the right that are beyond the string.
         Otherwise the cmpb sequence won't produce the correct
         results.  However if there is only one byte left, we
         can just subtract to get the final result so the shifts
         and clears are not needed.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* Loading just a single byte is a special case.  If we are
         loading more than that, we have to check whether we are
         looking at the entire chunk of data.  If not, rotate left and
         clear right so that bytes we aren't supposed to look at are
         zeroed, and the first byte we are supposed to compare is
         leftmost.  */
      if (load_mode_size != 1)
        {
          if (load_mode_size < word_mode_size)
            {
              /* Rotate left first.  */
              rtx sh = GEN_INT (BITS_PER_UNIT
                                * (word_mode_size - load_mode_size));
              do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
              do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
            }

          if (cmp_bytes < word_mode_size)
            {
              /* Now clear right.  This plus the rotate can be
                 turned into a rldicr instruction.  */
              HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
              rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
              do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
              do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
            }
        }

      /* Cases to handle.  A and B are chunks of the two strings.
         1: Not end of comparison:
            A != B: branch to cleanup code to compute result.
            A == B: check for 0 byte, next block if not found.
         2: End of the inline comparison:
            A != B: branch to cleanup code to compute result.
            A == B: check for 0 byte, call strcmp/strncmp.
         3: Compared the requested N bytes:
            A == B: branch to result 0.
            A != B: cleanup code to compute result.  */

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
        {
          /* Branch to cleanup code, otherwise fall through to do
             more compares.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
        }
      else
        /* Branch to end and produce result of 0.  */
        dst_label = final_move_label;

      if (load_mode_size == 1)
        {
          /* Special case for comparing just a single byte.  */
          if (equality_compare_rest)
            {
              /* Use subf./bne to branch to final_move_label if the
                 byte differs, otherwise fall through to the strncmp
                 call.  We must also check for a zero byte here as we
                 must not make the library call if this is the end of
                 the string.  */
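              /* Note that the record-form subtract below also writes
                 the byte difference into result_reg, so whichever
                 branch reaches final_move_label can use result_reg
                 directly as the final comparison result.  */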
              rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
              rtx cond = gen_reg_rtx (CCmode);
              rtx diff_rtx = gen_rtx_MINUS (word_mode,
                                            tmp_reg_src1, tmp_reg_src2);
              rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
              rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                                 lab_ref, pc_rtx);
              rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              add_reg_br_prob_note (j, profile_probability::unlikely ());
              JUMP_LABEL (j) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;

              /* Check for a zero byte here before falling through to
                 the library call.  This catches the case where the
                 strings are equal and end in a zero byte at this
                 position.  */

              rtx cond0 = gen_reg_rtx (CCmode);
              emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
                                                      const0_rtx));

              rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

              rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
                                                  lab_ref, pc_rtx);
              rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
              add_reg_br_prob_note (j0, profile_probability::unlikely ());
              JUMP_LABEL (j0) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;
            }
          else
            {
              /* This is the last byte to be compared so we can use
                 subf to compute the final result and branch
                 unconditionally to final_move_label.  */

              do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
              JUMP_LABEL (j) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;
              emit_barrier ();
            }
        }
      else
        {
          rtx cmpb_zero = gen_reg_rtx (word_mode);
          rtx cmpb_diff = gen_reg_rtx (word_mode);
          rtx zero_reg = gen_reg_rtx (word_mode);
          rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
          rtx cond = gen_reg_rtx (CCmode);

          emit_move_insn (zero_reg, GEN_INT (0));
          do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
          do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
          rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
          rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

          rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

          rtx cmp_rtx;
          if (remain == 0 && !equality_compare_rest)
            cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
          else
            cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

          rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                             lab_ref, pc_rtx);
          rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
          add_reg_br_prob_note (j, profile_probability::unlikely ());
          JUMP_LABEL (j) = dst_label;
          LABEL_NUSES (dst_label) += 1;
        }

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  *p_cleanup_label = cleanup_label;
  return;
}

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   cntlzd        get bit of first zero/diff byte
   addi          convert for rldcl use
   rldcl rldcl   extract diff/zero byte
   subf          subtract for final result

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.  */
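/* A worked example of the sequence, as I understand it: after the
   cmpb/orc. steps above, RESULT has 0xff in every byte position that
   differs or holds a zero byte.  If the first such byte is byte k
   counting from the left, cntlzd returns 8*k, adding 8 gives a rotate
   amount of 8*(k+1), and rotating each string left by that amount
   brings byte k down into the low-order byte, where the masked
   subtract produces the usual negative/zero/positive result.  */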
static void
emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
{
  machine_mode m = GET_MODE (str1);
  rtx rot_amt = gen_reg_rtx (m);

  rtx rot1_1 = gen_reg_rtx (m);
  rtx rot1_2 = gen_reg_rtx (m);
  rtx rot2_1 = gen_reg_rtx (m);
  rtx rot2_2 = gen_reg_rtx (m);

  if (m == SImode)
    {
      emit_insn (gen_clzsi2 (rot_amt, result));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
    }
  else if (m == DImode)
    {
      emit_insn (gen_clzdi2 (rot_amt, result));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
    }
  else
    gcc_unreachable ();

  return;
}

/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }

  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  unsigned int required_align = 8;

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if
                                   available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare
                                            inline.  */
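  /* For strcmp (NO_LENGTH nonzero) there is no N argument, so the
     inline-limit param stands in for it and a residual strcmp call is
     always emitted in case the strings are still equal at that point.
     For strncmp the constant N caps compare_length, and a residual
     strncmp call is needed only when N exceeds the inline limit.  */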
  if (no_length)
    bytes = rs6000_string_compare_inline_limit;
  else
    bytes = UINTVAL (bytes_rtx);

  /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 16 && !TARGET_32BIT
                 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  if (use_vec)
    required_align = 16;

  machine_mode load_mode;
  rtx tmp_reg_src1, tmp_reg_src2;
  if (use_vec)
    {
      load_mode = V16QImode;
      tmp_reg_src1 = gen_reg_rtx (V16QImode);
      tmp_reg_src2 = gen_reg_rtx (V16QImode);
    }
  else
    {
      load_mode = select_block_compare_mode (0, bytes, base_align);
      tmp_reg_src1 = gen_reg_rtx (word_mode);
      tmp_reg_src2 = gen_reg_rtx (word_mode);
    }

  compare_length = rs6000_string_compare_inline_limit;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
        compare_length = bytes;
      else
        equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < required_align)
    {
      /* Generate code that checks distance to 4k boundary for this
         case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
           rldicl r8,r3,0,52
           cmpldi cr7,r8,4096-16
           bgt cr7,L(pagecross)  */

      /* Make sure that the length we use for the alignment test and
         the subsequent code generation are in agreement so we do not
         go past the length we tested for a 4k boundary crossing.  */
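      /* For example, if compare_length is 10 with required_align 8,
         align_test below rounds up to 16 and base_align drops to 8;
         if compare_length is 6, align_test becomes the next power of
         two, 8.  */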
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < required_align)
        {
          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
          base_align = align_test;
        }
      else
        {
          align_test = ROUND_UP (align_test, required_align);
          base_align = required_align;
        }

      if (align1 < required_align)
        expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
      if (align2 < required_align)
        expand_strncmp_align_check (strncmp_label, src2_addr, align_test);

      /* Now generate the following sequence:
         - branch to begin_compare
         - strncmp_label
         - call to strncmp
         - branch to final_label
         - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode);
        }
      else
        {
          /* -m32 -mpowerpc64 results in word_mode being DImode even
             though otherwise it is 32-bit.  The length arg to strncmp
             is a size_t which will be the same size as pointers.  */
          rtx len_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));

          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode,
                                   len_rtx, Pmode);
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;

  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
     to the length specified.  */
  if (use_vec)
    {
      s1addr = gen_reg_rtx (Pmode);
      s2addr = gen_reg_rtx (Pmode);
      off_reg = gen_reg_rtx (Pmode);
      vec_result = gen_reg_rtx (load_mode);
      emit_move_insn (result_reg, GEN_INT (0));
      expand_cmp_vec_sequence (compare_length,
                               orig_src1, orig_src2,
                               s1addr, s2addr, off_reg,
                               tmp_reg_src1, tmp_reg_src2,
                               vec_result,
                               equality_compare_rest,
                               &cleanup_label, final_move_label, true);
    }
  else
    expand_strncmp_gpr_sequence (compare_length, base_align,
                                 orig_src1, orig_src2,
                                 tmp_reg_src1, tmp_reg_src2,
                                 result_reg,
                                 equality_compare_rest,
                                 &cleanup_label, final_move_label);

  offset = compare_length;

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      rtx src1 = force_reg (Pmode,
                            gen_rtx_PLUS (Pmode, src1_addr,
                                          GEN_INT (offset)));
      rtx src2 = force_reg (Pmode,
                            gen_rtx_PLUS (Pmode, src2_addr,
                                          GEN_INT (offset)));

      /* Construct call to strcmp/strncmp to compare the rest of the
         string.  */
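      /* The residual length passed to strncmp below is
         bytes - compare_length, since the first compare_length bytes
         are already known to be equal when this call is reached.  */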
      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   src1, Pmode, src2, Pmode);
        }
      else
        {
          rtx len_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (len_rtx,
                          gen_int_mode (bytes - compare_length, Pmode));
          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   src1, Pmode, src2, Pmode,
                                   len_rtx, Pmode);
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  if (use_vec)
    emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
                            s1addr, s2addr, orig_src1, orig_src2,
                            off_reg, vec_result);
  else
    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
                        gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}

/* Generate loads and stores for a move of v4si mode using lvx/stvx.
   This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
   keep combine from changing what instruction gets used.

   DEST is the destination for the data.
   SRC is the source of the data for the move.  */

static rtx
gen_lvx_v4si_move (rtx dest, rtx src)
{
  gcc_assert (MEM_P (dest) ^ MEM_P (src));
  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);

  if (MEM_P (dest))
    return gen_altivec_stvx_v4si_internal (dest, src);
  else
    return gen_altivec_lvx_v4si_internal (dest, src);
}

/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src = operands[1];
  rtx bytes_rtx = operands[2];
  rtx align_rtx = operands[3];
  int constp = CONST_INT_P (bytes_rtx);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
        rtx (*movmemsi) (rtx, rtx, rtx, rtx);
        rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
         when it applies, and usually not significantly larger.  */
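      /* The ladder below picks the widest chunk the alignment allows
         (or that a target tolerant of unaligned accesses permits):
         16-byte lvx/stvx, then 8-, 4-, 2- and 1-byte integer
         moves.  */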
      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
        {
          move_bytes = 16;
          mode = V4SImode;
          gen_func.mov = gen_lvx_v4si_move;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          move_bytes = 8;
          mode = DImode;
          gen_func.mov = gen_movdi;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per load and/or store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && CONST_INT_P (XEXP (addr, 1))
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
              addr = XEXP (orig_src, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && CONST_INT_P (XEXP (addr, 1))
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_src = replace_equiv_address (orig_src, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        { /* move 4 bytes */
          move_bytes = 4;
          mode = SImode;
          gen_func.mov = gen_movsi;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        { /* move 2 bytes */
          move_bytes = 2;
          mode = HImode;
          gen_func.mov = gen_movhi;
        }
      else /* move 1 byte at a time */
        {
          move_bytes = 1;
          mode = QImode;
          gen_func.mov = gen_movqi;
        }

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
        {
          rtx tmp_reg = gen_reg_rtx (mode);

          emit_insn ((*gen_func.mov) (tmp_reg, src));
          stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
        }

      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
        {
          int i;
          for (i = 0; i < num_reg; i++)
            emit_insn (stores[i]);
          num_reg = 0;
        }

      if (mode == BLKmode)
        {
          /* Move the address into scratch registers.  The movmemsi
             patterns require zero offset.  */
          if (!REG_P (XEXP (src, 0)))
            {
              rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
              src = replace_equiv_address (src, src_reg);
            }
          set_mem_size (src, move_bytes);

          if (!REG_P (XEXP (dest, 0)))
            {
              rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
              dest = replace_equiv_address (dest, dest_reg);
            }
          set_mem_size (dest, move_bytes);

          emit_insn ((*gen_func.movmemsi) (dest, src,
                                           GEN_INT (move_bytes & 31),
                                           align_rtx));
        }
    }

  return 1;
}
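/* Illustrative example for expand_block_move above: on a 32-bit
   target, a 10-byte fixed-size copy with 4-byte alignment expands as
   two SImode moves followed by one HImode move.  The three loads are
   emitted as they are generated, while the stores are queued in
   stores[] and flushed together when the last chunk is reached (or
   when MAX_MOVE_REG loads are outstanding).  */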