1/* Copyright (C) 1994-2015 Free Software Foundation, Inc. 2 3This file is free software; you can redistribute it and/or modify it 4under the terms of the GNU General Public License as published by the 5Free Software Foundation; either version 3, or (at your option) any 6later version. 7 8This file is distributed in the hope that it will be useful, but 9WITHOUT ANY WARRANTY; without even the implied warranty of 10MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11General Public License for more details. 12 13Under Section 7 of GPL version 3, you are granted additional 14permissions described in the GCC Runtime Library Exception, version 153.1, as published by the Free Software Foundation. 16 17You should have received a copy of the GNU General Public License and 18a copy of the GCC Runtime Library Exception along with this program; 19see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 20<http://www.gnu.org/licenses/>. */ 21 22 23!! libgcc routines for the Renesas / SuperH SH CPUs. 24!! Contributed by Steve Chamberlain. 25!! sac@cygnus.com 26 27!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines 28!! recoded in assembly by Toshiyasu Morita 29!! tm@netcom.com 30 31#if defined(__ELF__) && defined(__linux__) 32.section .note.GNU-stack,"",%progbits 33.previous 34#endif 35 36/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and 37 ELF local label prefixes by J"orn Rennecke 38 amylaar@cygnus.com */ 39 40#include "lib1funcs.h" 41 42/* t-vxworks needs to build both PIC and non-PIC versions of libgcc, 43 so it is more convenient to define NO_FPSCR_VALUES here than to 44 define it on the command line. */ 45#if defined __vxworks && defined __PIC__ 46#define NO_FPSCR_VALUES 47#endif 48 49#if ! 
__SH5__ 50#ifdef L_ashiftrt 51 .global GLOBAL(ashiftrt_r4_0) 52 .global GLOBAL(ashiftrt_r4_1) 53 .global GLOBAL(ashiftrt_r4_2) 54 .global GLOBAL(ashiftrt_r4_3) 55 .global GLOBAL(ashiftrt_r4_4) 56 .global GLOBAL(ashiftrt_r4_5) 57 .global GLOBAL(ashiftrt_r4_6) 58 .global GLOBAL(ashiftrt_r4_7) 59 .global GLOBAL(ashiftrt_r4_8) 60 .global GLOBAL(ashiftrt_r4_9) 61 .global GLOBAL(ashiftrt_r4_10) 62 .global GLOBAL(ashiftrt_r4_11) 63 .global GLOBAL(ashiftrt_r4_12) 64 .global GLOBAL(ashiftrt_r4_13) 65 .global GLOBAL(ashiftrt_r4_14) 66 .global GLOBAL(ashiftrt_r4_15) 67 .global GLOBAL(ashiftrt_r4_16) 68 .global GLOBAL(ashiftrt_r4_17) 69 .global GLOBAL(ashiftrt_r4_18) 70 .global GLOBAL(ashiftrt_r4_19) 71 .global GLOBAL(ashiftrt_r4_20) 72 .global GLOBAL(ashiftrt_r4_21) 73 .global GLOBAL(ashiftrt_r4_22) 74 .global GLOBAL(ashiftrt_r4_23) 75 .global GLOBAL(ashiftrt_r4_24) 76 .global GLOBAL(ashiftrt_r4_25) 77 .global GLOBAL(ashiftrt_r4_26) 78 .global GLOBAL(ashiftrt_r4_27) 79 .global GLOBAL(ashiftrt_r4_28) 80 .global GLOBAL(ashiftrt_r4_29) 81 .global GLOBAL(ashiftrt_r4_30) 82 .global GLOBAL(ashiftrt_r4_31) 83 .global GLOBAL(ashiftrt_r4_32) 84 85 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0)) 86 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1)) 87 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2)) 88 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3)) 89 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4)) 90 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5)) 91 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6)) 92 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7)) 93 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8)) 94 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9)) 95 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10)) 96 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11)) 97 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12)) 98 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13)) 99 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14)) 100 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15)) 101 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16)) 102 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17)) 103 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18)) 104 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19)) 105 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20)) 106 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21)) 
107 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22)) 108 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23)) 109 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24)) 110 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25)) 111 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26)) 112 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27)) 113 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28)) 114 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29)) 115 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30)) 116 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31)) 117 HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32)) 118 119 .align 1 120GLOBAL(ashiftrt_r4_32): 121GLOBAL(ashiftrt_r4_31): 122 rotcl r4 123 rts 124 subc r4,r4 125 126GLOBAL(ashiftrt_r4_30): 127 shar r4 128GLOBAL(ashiftrt_r4_29): 129 shar r4 130GLOBAL(ashiftrt_r4_28): 131 shar r4 132GLOBAL(ashiftrt_r4_27): 133 shar r4 134GLOBAL(ashiftrt_r4_26): 135 shar r4 136GLOBAL(ashiftrt_r4_25): 137 shar r4 138GLOBAL(ashiftrt_r4_24): 139 shlr16 r4 140 shlr8 r4 141 rts 142 exts.b r4,r4 143 144GLOBAL(ashiftrt_r4_23): 145 shar r4 146GLOBAL(ashiftrt_r4_22): 147 shar r4 148GLOBAL(ashiftrt_r4_21): 149 shar r4 150GLOBAL(ashiftrt_r4_20): 151 shar r4 152GLOBAL(ashiftrt_r4_19): 153 shar r4 154GLOBAL(ashiftrt_r4_18): 155 shar r4 156GLOBAL(ashiftrt_r4_17): 157 shar r4 158GLOBAL(ashiftrt_r4_16): 159 shlr16 r4 160 rts 161 exts.w r4,r4 162 163GLOBAL(ashiftrt_r4_15): 164 shar r4 165GLOBAL(ashiftrt_r4_14): 166 shar r4 167GLOBAL(ashiftrt_r4_13): 168 shar r4 169GLOBAL(ashiftrt_r4_12): 170 shar r4 171GLOBAL(ashiftrt_r4_11): 172 shar r4 173GLOBAL(ashiftrt_r4_10): 174 shar r4 175GLOBAL(ashiftrt_r4_9): 176 shar r4 177GLOBAL(ashiftrt_r4_8): 178 shar r4 179GLOBAL(ashiftrt_r4_7): 180 shar r4 181GLOBAL(ashiftrt_r4_6): 182 shar r4 183GLOBAL(ashiftrt_r4_5): 184 shar r4 185GLOBAL(ashiftrt_r4_4): 186 shar r4 187GLOBAL(ashiftrt_r4_3): 188 shar r4 189GLOBAL(ashiftrt_r4_2): 190 shar r4 191GLOBAL(ashiftrt_r4_1): 192 rts 193 shar r4 194 195GLOBAL(ashiftrt_r4_0): 196 rts 197 nop 198 199 ENDFUNC(GLOBAL(ashiftrt_r4_0)) 200 ENDFUNC(GLOBAL(ashiftrt_r4_1)) 201 ENDFUNC(GLOBAL(ashiftrt_r4_2)) 202 ENDFUNC(GLOBAL(ashiftrt_r4_3)) 203 
ENDFUNC(GLOBAL(ashiftrt_r4_4)) 204 ENDFUNC(GLOBAL(ashiftrt_r4_5)) 205 ENDFUNC(GLOBAL(ashiftrt_r4_6)) 206 ENDFUNC(GLOBAL(ashiftrt_r4_7)) 207 ENDFUNC(GLOBAL(ashiftrt_r4_8)) 208 ENDFUNC(GLOBAL(ashiftrt_r4_9)) 209 ENDFUNC(GLOBAL(ashiftrt_r4_10)) 210 ENDFUNC(GLOBAL(ashiftrt_r4_11)) 211 ENDFUNC(GLOBAL(ashiftrt_r4_12)) 212 ENDFUNC(GLOBAL(ashiftrt_r4_13)) 213 ENDFUNC(GLOBAL(ashiftrt_r4_14)) 214 ENDFUNC(GLOBAL(ashiftrt_r4_15)) 215 ENDFUNC(GLOBAL(ashiftrt_r4_16)) 216 ENDFUNC(GLOBAL(ashiftrt_r4_17)) 217 ENDFUNC(GLOBAL(ashiftrt_r4_18)) 218 ENDFUNC(GLOBAL(ashiftrt_r4_19)) 219 ENDFUNC(GLOBAL(ashiftrt_r4_20)) 220 ENDFUNC(GLOBAL(ashiftrt_r4_21)) 221 ENDFUNC(GLOBAL(ashiftrt_r4_22)) 222 ENDFUNC(GLOBAL(ashiftrt_r4_23)) 223 ENDFUNC(GLOBAL(ashiftrt_r4_24)) 224 ENDFUNC(GLOBAL(ashiftrt_r4_25)) 225 ENDFUNC(GLOBAL(ashiftrt_r4_26)) 226 ENDFUNC(GLOBAL(ashiftrt_r4_27)) 227 ENDFUNC(GLOBAL(ashiftrt_r4_28)) 228 ENDFUNC(GLOBAL(ashiftrt_r4_29)) 229 ENDFUNC(GLOBAL(ashiftrt_r4_30)) 230 ENDFUNC(GLOBAL(ashiftrt_r4_31)) 231 ENDFUNC(GLOBAL(ashiftrt_r4_32)) 232#endif 233 234#ifdef L_ashiftrt_n 235 236! 237! GLOBAL(ashrsi3) 238! 239! Entry: 240! 241! r4: Value to shift 242! r5: Shift count 243! 244! Exit: 245! 246! r0: Result 247! 248! Destroys: 249! 250! T bit, r5 251! 
252 253 .global GLOBAL(ashrsi3) 254 HIDDEN_FUNC(GLOBAL(ashrsi3)) 255 .align 2 256GLOBAL(ashrsi3): 257 mov #31,r0 258 and r0,r5 259 mova LOCAL(ashrsi3_table),r0 260 mov.b @(r0,r5),r5 261#ifdef __sh1__ 262 add r5,r0 263 jmp @r0 264#else 265 braf r5 266#endif 267 mov r4,r0 268 269 .align 2 270LOCAL(ashrsi3_table): 271 .byte LOCAL(ashrsi3_0)-LOCAL(ashrsi3_table) 272 .byte LOCAL(ashrsi3_1)-LOCAL(ashrsi3_table) 273 .byte LOCAL(ashrsi3_2)-LOCAL(ashrsi3_table) 274 .byte LOCAL(ashrsi3_3)-LOCAL(ashrsi3_table) 275 .byte LOCAL(ashrsi3_4)-LOCAL(ashrsi3_table) 276 .byte LOCAL(ashrsi3_5)-LOCAL(ashrsi3_table) 277 .byte LOCAL(ashrsi3_6)-LOCAL(ashrsi3_table) 278 .byte LOCAL(ashrsi3_7)-LOCAL(ashrsi3_table) 279 .byte LOCAL(ashrsi3_8)-LOCAL(ashrsi3_table) 280 .byte LOCAL(ashrsi3_9)-LOCAL(ashrsi3_table) 281 .byte LOCAL(ashrsi3_10)-LOCAL(ashrsi3_table) 282 .byte LOCAL(ashrsi3_11)-LOCAL(ashrsi3_table) 283 .byte LOCAL(ashrsi3_12)-LOCAL(ashrsi3_table) 284 .byte LOCAL(ashrsi3_13)-LOCAL(ashrsi3_table) 285 .byte LOCAL(ashrsi3_14)-LOCAL(ashrsi3_table) 286 .byte LOCAL(ashrsi3_15)-LOCAL(ashrsi3_table) 287 .byte LOCAL(ashrsi3_16)-LOCAL(ashrsi3_table) 288 .byte LOCAL(ashrsi3_17)-LOCAL(ashrsi3_table) 289 .byte LOCAL(ashrsi3_18)-LOCAL(ashrsi3_table) 290 .byte LOCAL(ashrsi3_19)-LOCAL(ashrsi3_table) 291 .byte LOCAL(ashrsi3_20)-LOCAL(ashrsi3_table) 292 .byte LOCAL(ashrsi3_21)-LOCAL(ashrsi3_table) 293 .byte LOCAL(ashrsi3_22)-LOCAL(ashrsi3_table) 294 .byte LOCAL(ashrsi3_23)-LOCAL(ashrsi3_table) 295 .byte LOCAL(ashrsi3_24)-LOCAL(ashrsi3_table) 296 .byte LOCAL(ashrsi3_25)-LOCAL(ashrsi3_table) 297 .byte LOCAL(ashrsi3_26)-LOCAL(ashrsi3_table) 298 .byte LOCAL(ashrsi3_27)-LOCAL(ashrsi3_table) 299 .byte LOCAL(ashrsi3_28)-LOCAL(ashrsi3_table) 300 .byte LOCAL(ashrsi3_29)-LOCAL(ashrsi3_table) 301 .byte LOCAL(ashrsi3_30)-LOCAL(ashrsi3_table) 302 .byte LOCAL(ashrsi3_31)-LOCAL(ashrsi3_table) 303 304LOCAL(ashrsi3_31): 305 rotcl r0 306 rts 307 subc r0,r0 308 309LOCAL(ashrsi3_30): 310 shar r0 311LOCAL(ashrsi3_29): 312 
shar r0 313LOCAL(ashrsi3_28): 314 shar r0 315LOCAL(ashrsi3_27): 316 shar r0 317LOCAL(ashrsi3_26): 318 shar r0 319LOCAL(ashrsi3_25): 320 shar r0 321LOCAL(ashrsi3_24): 322 shlr16 r0 323 shlr8 r0 324 rts 325 exts.b r0,r0 326 327LOCAL(ashrsi3_23): 328 shar r0 329LOCAL(ashrsi3_22): 330 shar r0 331LOCAL(ashrsi3_21): 332 shar r0 333LOCAL(ashrsi3_20): 334 shar r0 335LOCAL(ashrsi3_19): 336 shar r0 337LOCAL(ashrsi3_18): 338 shar r0 339LOCAL(ashrsi3_17): 340 shar r0 341LOCAL(ashrsi3_16): 342 shlr16 r0 343 rts 344 exts.w r0,r0 345 346LOCAL(ashrsi3_15): 347 shar r0 348LOCAL(ashrsi3_14): 349 shar r0 350LOCAL(ashrsi3_13): 351 shar r0 352LOCAL(ashrsi3_12): 353 shar r0 354LOCAL(ashrsi3_11): 355 shar r0 356LOCAL(ashrsi3_10): 357 shar r0 358LOCAL(ashrsi3_9): 359 shar r0 360LOCAL(ashrsi3_8): 361 shar r0 362LOCAL(ashrsi3_7): 363 shar r0 364LOCAL(ashrsi3_6): 365 shar r0 366LOCAL(ashrsi3_5): 367 shar r0 368LOCAL(ashrsi3_4): 369 shar r0 370LOCAL(ashrsi3_3): 371 shar r0 372LOCAL(ashrsi3_2): 373 shar r0 374LOCAL(ashrsi3_1): 375 rts 376 shar r0 377 378LOCAL(ashrsi3_0): 379 rts 380 nop 381 382 ENDFUNC(GLOBAL(ashrsi3)) 383#endif 384 385#ifdef L_ashiftlt 386 387! 388! GLOBAL(ashlsi3) 389! (For compatibility with older binaries, not used by compiler) 390! 391! Entry: 392! r4: Value to shift 393! r5: Shift count 394! 395! Exit: 396! r0: Result 397! 398! Destroys: 399! T bit 400! 401! 402! GLOBAL(ashlsi3_r0) 403! 404! Entry: 405! r4: Value to shift 406! r0: Shift count 407! 408! Exit: 409! r0: Result 410! 411! Destroys: 412! 
T bit 413 414 .global GLOBAL(ashlsi3) 415 .global GLOBAL(ashlsi3_r0) 416 HIDDEN_FUNC(GLOBAL(ashlsi3)) 417 HIDDEN_FUNC(GLOBAL(ashlsi3_r0)) 418GLOBAL(ashlsi3): 419 mov r5,r0 420 .align 2 421GLOBAL(ashlsi3_r0): 422 423#ifdef __sh1__ 424 and #31,r0 425 shll2 r0 426 mov.l r4,@-r15 427 mov r0,r4 428 mova LOCAL(ashlsi3_table),r0 429 add r4,r0 430 mov.l @r15+,r4 431 jmp @r0 432 mov r4,r0 433 .align 2 434#else 435 and #31,r0 436 shll2 r0 437 braf r0 438 mov r4,r0 439#endif 440 441LOCAL(ashlsi3_table): 442 rts // << 0 443 nop 444LOCAL(ashlsi_1): 445 rts // << 1 446 shll r0 447LOCAL(ashlsi_2): // << 2 448 rts 449 shll2 r0 450 bra LOCAL(ashlsi_1) // << 3 451 shll2 r0 452 bra LOCAL(ashlsi_2) // << 4 453 shll2 r0 454 bra LOCAL(ashlsi_5) // << 5 455 shll r0 456 bra LOCAL(ashlsi_6) // << 6 457 shll2 r0 458 bra LOCAL(ashlsi_7) // << 7 459 shll r0 460LOCAL(ashlsi_8): // << 8 461 rts 462 shll8 r0 463 bra LOCAL(ashlsi_8) // << 9 464 shll r0 465 bra LOCAL(ashlsi_8) // << 10 466 shll2 r0 467 bra LOCAL(ashlsi_11) // << 11 468 shll r0 469 bra LOCAL(ashlsi_12) // << 12 470 shll2 r0 471 bra LOCAL(ashlsi_13) // << 13 472 shll r0 473 bra LOCAL(ashlsi_14) // << 14 474 shll8 r0 475 bra LOCAL(ashlsi_15) // << 15 476 shll8 r0 477LOCAL(ashlsi_16): // << 16 478 rts 479 shll16 r0 480 bra LOCAL(ashlsi_16) // << 17 481 shll r0 482 bra LOCAL(ashlsi_16) // << 18 483 shll2 r0 484 bra LOCAL(ashlsi_19) // << 19 485 shll r0 486 bra LOCAL(ashlsi_20) // << 20 487 shll2 r0 488 bra LOCAL(ashlsi_21) // << 21 489 shll r0 490 bra LOCAL(ashlsi_22) // << 22 491 shll16 r0 492 bra LOCAL(ashlsi_23) // << 23 493 shll16 r0 494 bra LOCAL(ashlsi_16) // << 24 495 shll8 r0 496 bra LOCAL(ashlsi_25) // << 25 497 shll r0 498 bra LOCAL(ashlsi_26) // << 26 499 shll2 r0 500 bra LOCAL(ashlsi_27) // << 27 501 shll r0 502 bra LOCAL(ashlsi_28) // << 28 503 shll2 r0 504 bra LOCAL(ashlsi_29) // << 29 505 shll16 r0 506 bra LOCAL(ashlsi_30) // << 30 507 shll16 r0 508 and #1,r0 // << 31 509 rts 510 rotr r0 511 512LOCAL(ashlsi_7): 513 shll2 
r0 514LOCAL(ashlsi_5): 515LOCAL(ashlsi_6): 516 shll2 r0 517 rts 518LOCAL(ashlsi_13): 519 shll2 r0 520LOCAL(ashlsi_12): 521LOCAL(ashlsi_11): 522 shll8 r0 523 rts 524LOCAL(ashlsi_21): 525 shll2 r0 526LOCAL(ashlsi_20): 527LOCAL(ashlsi_19): 528 shll16 r0 529 rts 530LOCAL(ashlsi_28): 531LOCAL(ashlsi_27): 532 shll2 r0 533LOCAL(ashlsi_26): 534LOCAL(ashlsi_25): 535 shll16 r0 536 rts 537 shll8 r0 538 539LOCAL(ashlsi_22): 540LOCAL(ashlsi_14): 541 shlr2 r0 542 rts 543 shll8 r0 544 545LOCAL(ashlsi_23): 546LOCAL(ashlsi_15): 547 shlr r0 548 rts 549 shll8 r0 550 551LOCAL(ashlsi_29): 552 shlr r0 553LOCAL(ashlsi_30): 554 shlr2 r0 555 rts 556 shll16 r0 557 558 ENDFUNC(GLOBAL(ashlsi3)) 559 ENDFUNC(GLOBAL(ashlsi3_r0)) 560#endif 561 562#ifdef L_lshiftrt 563 564! 565! GLOBAL(lshrsi3) 566! (For compatibility with older binaries, not used by compiler) 567! 568! Entry: 569! r4: Value to shift 570! r5: Shift count 571! 572! Exit: 573! r0: Result 574! 575! Destroys: 576! T bit 577! 578! 579! GLOBAL(lshrsi3_r0) 580! 581! Entry: 582! r4: Value to shift 583! r0: Shift count 584! 585! Exit: 586! r0: Result 587! 588! Destroys: 589! 
T bit 590 591 .global GLOBAL(lshrsi3) 592 .global GLOBAL(lshrsi3_r0) 593 HIDDEN_FUNC(GLOBAL(lshrsi3)) 594 HIDDEN_FUNC(GLOBAL(lshrsi3_r0)) 595GLOBAL(lshrsi3): 596 mov r5,r0 597 .align 2 598GLOBAL(lshrsi3_r0): 599 600#ifdef __sh1__ 601 and #31,r0 602 shll2 r0 603 mov.l r4,@-r15 604 mov r0,r4 605 mova LOCAL(lshrsi3_table),r0 606 add r4,r0 607 mov.l @r15+,r4 608 jmp @r0 609 mov r4,r0 610 .align 2 611#else 612 and #31,r0 613 shll2 r0 614 braf r0 615 mov r4,r0 616#endif 617LOCAL(lshrsi3_table): 618 rts // >> 0 619 nop 620LOCAL(lshrsi_1): // >> 1 621 rts 622 shlr r0 623LOCAL(lshrsi_2): // >> 2 624 rts 625 shlr2 r0 626 bra LOCAL(lshrsi_1) // >> 3 627 shlr2 r0 628 bra LOCAL(lshrsi_2) // >> 4 629 shlr2 r0 630 bra LOCAL(lshrsi_5) // >> 5 631 shlr r0 632 bra LOCAL(lshrsi_6) // >> 6 633 shlr2 r0 634 bra LOCAL(lshrsi_7) // >> 7 635 shlr r0 636LOCAL(lshrsi_8): // >> 8 637 rts 638 shlr8 r0 639 bra LOCAL(lshrsi_8) // >> 9 640 shlr r0 641 bra LOCAL(lshrsi_8) // >> 10 642 shlr2 r0 643 bra LOCAL(lshrsi_11) // >> 11 644 shlr r0 645 bra LOCAL(lshrsi_12) // >> 12 646 shlr2 r0 647 bra LOCAL(lshrsi_13) // >> 13 648 shlr r0 649 bra LOCAL(lshrsi_14) // >> 14 650 shlr8 r0 651 bra LOCAL(lshrsi_15) // >> 15 652 shlr8 r0 653LOCAL(lshrsi_16): // >> 16 654 rts 655 shlr16 r0 656 bra LOCAL(lshrsi_16) // >> 17 657 shlr r0 658 bra LOCAL(lshrsi_16) // >> 18 659 shlr2 r0 660 bra LOCAL(lshrsi_19) // >> 19 661 shlr r0 662 bra LOCAL(lshrsi_20) // >> 20 663 shlr2 r0 664 bra LOCAL(lshrsi_21) // >> 21 665 shlr r0 666 bra LOCAL(lshrsi_22) // >> 22 667 shlr16 r0 668 bra LOCAL(lshrsi_23) // >> 23 669 shlr16 r0 670 bra LOCAL(lshrsi_16) // >> 24 671 shlr8 r0 672 bra LOCAL(lshrsi_25) // >> 25 673 shlr r0 674 bra LOCAL(lshrsi_26) // >> 26 675 shlr2 r0 676 bra LOCAL(lshrsi_27) // >> 27 677 shlr r0 678 bra LOCAL(lshrsi_28) // >> 28 679 shlr2 r0 680 bra LOCAL(lshrsi_29) // >> 29 681 shlr16 r0 682 bra LOCAL(lshrsi_30) // >> 30 683 shlr16 r0 684 shll r0 // >> 31 685 rts 686 movt r0 687 688LOCAL(lshrsi_7): 689 shlr2 r0 
690LOCAL(lshrsi_5): 691LOCAL(lshrsi_6): 692 shlr2 r0 693 rts 694LOCAL(lshrsi_13): 695 shlr2 r0 696LOCAL(lshrsi_12): 697LOCAL(lshrsi_11): 698 shlr8 r0 699 rts 700LOCAL(lshrsi_21): 701 shlr2 r0 702LOCAL(lshrsi_20): 703LOCAL(lshrsi_19): 704 shlr16 r0 705 rts 706LOCAL(lshrsi_28): 707LOCAL(lshrsi_27): 708 shlr2 r0 709LOCAL(lshrsi_26): 710LOCAL(lshrsi_25): 711 shlr16 r0 712 rts 713 shlr8 r0 714 715LOCAL(lshrsi_22): 716LOCAL(lshrsi_14): 717 shll2 r0 718 rts 719 shlr8 r0 720 721LOCAL(lshrsi_23): 722LOCAL(lshrsi_15): 723 shll r0 724 rts 725 shlr8 r0 726 727LOCAL(lshrsi_29): 728 shll r0 729LOCAL(lshrsi_30): 730 shll2 r0 731 rts 732 shlr16 r0 733 734 ENDFUNC(GLOBAL(lshrsi3)) 735 ENDFUNC(GLOBAL(lshrsi3_r0)) 736#endif 737 738#ifdef L_movmem 739 .text 740 .balign 4 741 .global GLOBAL(movmem) 742 HIDDEN_FUNC(GLOBAL(movmem)) 743 HIDDEN_ALIAS(movstr,movmem) 744 /* This would be a lot simpler if r6 contained the byte count 745 minus 64, and we wouldn't be called here for a byte count of 64. */ 746GLOBAL(movmem): 747 sts.l pr,@-r15 748 shll2 r6 749 bsr GLOBAL(movmemSI52+2) 750 mov.l @(48,r5),r0 751 .balign 4 752LOCAL(movmem_loop): /* Reached with rts */ 753 mov.l @(60,r5),r0 754 add #-64,r6 755 mov.l r0,@(60,r4) 756 tst r6,r6 757 mov.l @(56,r5),r0 758 bt LOCAL(movmem_done) 759 mov.l r0,@(56,r4) 760 cmp/pl r6 761 mov.l @(52,r5),r0 762 add #64,r5 763 mov.l r0,@(52,r4) 764 add #64,r4 765 bt GLOBAL(movmemSI52) 766! done all the large groups, do the remainder 767! jump to movmem+ 768 mova GLOBAL(movmemSI4)+4,r0 769 add r6,r0 770 jmp @r0 771LOCAL(movmem_done): ! share slot insn, works out aligned. 772 lds.l @r15+,pr 773 mov.l r0,@(56,r4) 774 mov.l @(52,r5),r0 775 rts 776 mov.l r0,@(52,r4) 777 .balign 4 778! ??? We need aliases movstr* for movmem* for the older libraries. These 779! aliases will be removed at the some point in the future. 
780 .global GLOBAL(movmemSI64) 781 HIDDEN_FUNC(GLOBAL(movmemSI64)) 782 HIDDEN_ALIAS(movstrSI64,movmemSI64) 783GLOBAL(movmemSI64): 784 mov.l @(60,r5),r0 785 mov.l r0,@(60,r4) 786 .global GLOBAL(movmemSI60) 787 HIDDEN_FUNC(GLOBAL(movmemSI60)) 788 HIDDEN_ALIAS(movstrSI60,movmemSI60) 789GLOBAL(movmemSI60): 790 mov.l @(56,r5),r0 791 mov.l r0,@(56,r4) 792 .global GLOBAL(movmemSI56) 793 HIDDEN_FUNC(GLOBAL(movmemSI56)) 794 HIDDEN_ALIAS(movstrSI56,movmemSI56) 795GLOBAL(movmemSI56): 796 mov.l @(52,r5),r0 797 mov.l r0,@(52,r4) 798 .global GLOBAL(movmemSI52) 799 HIDDEN_FUNC(GLOBAL(movmemSI52)) 800 HIDDEN_ALIAS(movstrSI52,movmemSI52) 801GLOBAL(movmemSI52): 802 mov.l @(48,r5),r0 803 mov.l r0,@(48,r4) 804 .global GLOBAL(movmemSI48) 805 HIDDEN_FUNC(GLOBAL(movmemSI48)) 806 HIDDEN_ALIAS(movstrSI48,movmemSI48) 807GLOBAL(movmemSI48): 808 mov.l @(44,r5),r0 809 mov.l r0,@(44,r4) 810 .global GLOBAL(movmemSI44) 811 HIDDEN_FUNC(GLOBAL(movmemSI44)) 812 HIDDEN_ALIAS(movstrSI44,movmemSI44) 813GLOBAL(movmemSI44): 814 mov.l @(40,r5),r0 815 mov.l r0,@(40,r4) 816 .global GLOBAL(movmemSI40) 817 HIDDEN_FUNC(GLOBAL(movmemSI40)) 818 HIDDEN_ALIAS(movstrSI40,movmemSI40) 819GLOBAL(movmemSI40): 820 mov.l @(36,r5),r0 821 mov.l r0,@(36,r4) 822 .global GLOBAL(movmemSI36) 823 HIDDEN_FUNC(GLOBAL(movmemSI36)) 824 HIDDEN_ALIAS(movstrSI36,movmemSI36) 825GLOBAL(movmemSI36): 826 mov.l @(32,r5),r0 827 mov.l r0,@(32,r4) 828 .global GLOBAL(movmemSI32) 829 HIDDEN_FUNC(GLOBAL(movmemSI32)) 830 HIDDEN_ALIAS(movstrSI32,movmemSI32) 831GLOBAL(movmemSI32): 832 mov.l @(28,r5),r0 833 mov.l r0,@(28,r4) 834 .global GLOBAL(movmemSI28) 835 HIDDEN_FUNC(GLOBAL(movmemSI28)) 836 HIDDEN_ALIAS(movstrSI28,movmemSI28) 837GLOBAL(movmemSI28): 838 mov.l @(24,r5),r0 839 mov.l r0,@(24,r4) 840 .global GLOBAL(movmemSI24) 841 HIDDEN_FUNC(GLOBAL(movmemSI24)) 842 HIDDEN_ALIAS(movstrSI24,movmemSI24) 843GLOBAL(movmemSI24): 844 mov.l @(20,r5),r0 845 mov.l r0,@(20,r4) 846 .global GLOBAL(movmemSI20) 847 HIDDEN_FUNC(GLOBAL(movmemSI20)) 848 
HIDDEN_ALIAS(movstrSI20,movmemSI20) 849GLOBAL(movmemSI20): 850 mov.l @(16,r5),r0 851 mov.l r0,@(16,r4) 852 .global GLOBAL(movmemSI16) 853 HIDDEN_FUNC(GLOBAL(movmemSI16)) 854 HIDDEN_ALIAS(movstrSI16,movmemSI16) 855GLOBAL(movmemSI16): 856 mov.l @(12,r5),r0 857 mov.l r0,@(12,r4) 858 .global GLOBAL(movmemSI12) 859 HIDDEN_FUNC(GLOBAL(movmemSI12)) 860 HIDDEN_ALIAS(movstrSI12,movmemSI12) 861GLOBAL(movmemSI12): 862 mov.l @(8,r5),r0 863 mov.l r0,@(8,r4) 864 .global GLOBAL(movmemSI8) 865 HIDDEN_FUNC(GLOBAL(movmemSI8)) 866 HIDDEN_ALIAS(movstrSI8,movmemSI8) 867GLOBAL(movmemSI8): 868 mov.l @(4,r5),r0 869 mov.l r0,@(4,r4) 870 .global GLOBAL(movmemSI4) 871 HIDDEN_FUNC(GLOBAL(movmemSI4)) 872 HIDDEN_ALIAS(movstrSI4,movmemSI4) 873GLOBAL(movmemSI4): 874 mov.l @(0,r5),r0 875 rts 876 mov.l r0,@(0,r4) 877 878 ENDFUNC(GLOBAL(movmemSI64)) 879 ENDFUNC(GLOBAL(movmemSI60)) 880 ENDFUNC(GLOBAL(movmemSI56)) 881 ENDFUNC(GLOBAL(movmemSI52)) 882 ENDFUNC(GLOBAL(movmemSI48)) 883 ENDFUNC(GLOBAL(movmemSI44)) 884 ENDFUNC(GLOBAL(movmemSI40)) 885 ENDFUNC(GLOBAL(movmemSI36)) 886 ENDFUNC(GLOBAL(movmemSI32)) 887 ENDFUNC(GLOBAL(movmemSI28)) 888 ENDFUNC(GLOBAL(movmemSI24)) 889 ENDFUNC(GLOBAL(movmemSI20)) 890 ENDFUNC(GLOBAL(movmemSI16)) 891 ENDFUNC(GLOBAL(movmemSI12)) 892 ENDFUNC(GLOBAL(movmemSI8)) 893 ENDFUNC(GLOBAL(movmemSI4)) 894 ENDFUNC(GLOBAL(movmem)) 895#endif 896 897#ifdef L_movmem_i4 898 .text 899 .global GLOBAL(movmem_i4_even) 900 .global GLOBAL(movmem_i4_odd) 901 .global GLOBAL(movmemSI12_i4) 902 903 HIDDEN_FUNC(GLOBAL(movmem_i4_even)) 904 HIDDEN_FUNC(GLOBAL(movmem_i4_odd)) 905 HIDDEN_FUNC(GLOBAL(movmemSI12_i4)) 906 907 HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even) 908 HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd) 909 HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4) 910 911 .p2align 5 912L_movmem_2mod4_end: 913 mov.l r0,@(16,r4) 914 rts 915 mov.l r1,@(20,r4) 916 917 .p2align 2 918 919GLOBAL(movmem_i4_even): 920 mov.l @r5+,r0 921 bra L_movmem_start_even 922 mov.l @r5+,r1 923 924GLOBAL(movmem_i4_odd): 925 mov.l 
@r5+,r1 926 add #-4,r4 927 mov.l @r5+,r2 928 mov.l @r5+,r3 929 mov.l r1,@(4,r4) 930 mov.l r2,@(8,r4) 931 932L_movmem_loop: 933 mov.l r3,@(12,r4) 934 dt r6 935 mov.l @r5+,r0 936 bt/s L_movmem_2mod4_end 937 mov.l @r5+,r1 938 add #16,r4 939L_movmem_start_even: 940 mov.l @r5+,r2 941 mov.l @r5+,r3 942 mov.l r0,@r4 943 dt r6 944 mov.l r1,@(4,r4) 945 bf/s L_movmem_loop 946 mov.l r2,@(8,r4) 947 rts 948 mov.l r3,@(12,r4) 949 950 ENDFUNC(GLOBAL(movmem_i4_even)) 951 ENDFUNC(GLOBAL(movmem_i4_odd)) 952 953 .p2align 4 954GLOBAL(movmemSI12_i4): 955 mov.l @r5,r0 956 mov.l @(4,r5),r1 957 mov.l @(8,r5),r2 958 mov.l r0,@r4 959 mov.l r1,@(4,r4) 960 rts 961 mov.l r2,@(8,r4) 962 963 ENDFUNC(GLOBAL(movmemSI12_i4)) 964#endif 965 966#ifdef L_mulsi3 967 968 969 .global GLOBAL(mulsi3) 970 HIDDEN_FUNC(GLOBAL(mulsi3)) 971 972! r4 = aabb 973! r5 = ccdd 974! r0 = aabb*ccdd via partial products 975! 976! if aa == 0 and cc = 0 977! r0 = bb*dd 978! 979! else 980! aa = bb*dd + (aa*dd*65536) + (cc*bb*65536) 981! 982 983GLOBAL(mulsi3): 984 mulu.w r4,r5 ! multiply the lsws macl=bb*dd 985 mov r5,r3 ! r3 = ccdd 986 swap.w r4,r2 ! r2 = bbaa 987 xtrct r2,r3 ! r3 = aacc 988 tst r3,r3 ! msws zero ? 989 bf hiset 990 rts ! yes - then we have the answer 991 sts macl,r0 992 993hiset: sts macl,r0 ! r0 = bb*dd 994 mulu.w r2,r5 ! brewing macl = aa*dd 995 sts macl,r1 996 mulu.w r3,r4 ! brewing macl = cc*bb 997 sts macl,r2 998 add r1,r2 999 shll16 r2 1000 rts 1001 add r2,r0 1002 1003 ENDFUNC(GLOBAL(mulsi3)) 1004#endif 1005#endif /* ! __SH5__ */ 1006 1007/*------------------------------------------------------------------------------ 1008 32 bit signed integer division that uses FPU double precision division. */ 1009 1010#ifdef L_sdivsi3_i4 1011 .title "SH DIVIDE" 1012 1013#if defined (__SH4__) || defined (__SH2A__) 1014/* This variant is used when FPSCR.PR = 1 (double precision) is the default 1015 setting. 1016 Args in r4 and r5, result in fpul, clobber dr0, dr2. 
*/ 1017 1018 .global GLOBAL(sdivsi3_i4) 1019 HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) 1020GLOBAL(sdivsi3_i4): 1021 lds r4,fpul 1022 float fpul,dr0 1023 lds r5,fpul 1024 float fpul,dr2 1025 fdiv dr2,dr0 1026 rts 1027 ftrc dr0,fpul 1028 1029 ENDFUNC(GLOBAL(sdivsi3_i4)) 1030 1031#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__) 1032/* This variant is used when FPSCR.PR = 0 (sigle precision) is the default 1033 setting. 1034 Args in r4 and r5, result in fpul, clobber r2, dr0, dr2. 1035 For this to work, we must temporarily switch the FPU do double precision, 1036 but we better do not touch FPSCR.FR. See PR 6526. */ 1037 1038#if ! __SH5__ || __SH5__ == 32 1039#if __SH5__ 1040 .mode SHcompact 1041#endif 1042 .global GLOBAL(sdivsi3_i4) 1043 HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) 1044GLOBAL(sdivsi3_i4): 1045 1046#ifndef __SH4A__ 1047 mov.l r3,@-r15 1048 sts fpscr,r2 1049 mov #8,r3 1050 swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit) 1051 or r2,r3 1052 lds r3,fpscr // Set FPSCR.PR = 1. 1053 lds r4,fpul 1054 float fpul,dr0 1055 lds r5,fpul 1056 float fpul,dr2 1057 fdiv dr2,dr0 1058 ftrc dr0,fpul 1059 lds r2,fpscr 1060 rts 1061 mov.l @r15+,r3 1062#else 1063/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */ 1064 fpchg 1065 lds r4,fpul 1066 float fpul,dr0 1067 lds r5,fpul 1068 float fpul,dr2 1069 fdiv dr2,dr0 1070 ftrc dr0,fpul 1071 rts 1072 fpchg 1073 1074#endif /* __SH4A__ */ 1075 1076 ENDFUNC(GLOBAL(sdivsi3_i4)) 1077#endif /* ! __SH5__ || __SH5__ == 32 */ 1078#endif /* ! __SH4__ || __SH2A__ */ 1079#endif /* L_sdivsi3_i4 */ 1080 1081//------------------------------------------------------------------------------ 1082#ifdef L_sdivsi3 1083/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with 1084 sh2e/sh3e code. */ 1085!! 1086!! Steve Chamberlain 1087!! sac@cygnus.com 1088!! 1089!! 1090 1091!! 
args in r4 and r5, result in r0 clobber r1, r2, r3, and t bit 1092 1093 .global GLOBAL(sdivsi3) 1094#if __SHMEDIA__ 1095#if __SH5__ == 32 1096 .section .text..SHmedia32,"ax" 1097#else 1098 .text 1099#endif 1100 .align 2 1101#if 0 1102/* The assembly code that follows is a hand-optimized version of the C 1103 code that follows. Note that the registers that are modified are 1104 exactly those listed as clobbered in the patterns divsi3_i1 and 1105 divsi3_i1_media. 1106 1107int __sdivsi3 (i, j) 1108 int i, j; 1109{ 1110 register unsigned long long r18 asm ("r18"); 1111 register unsigned long long r19 asm ("r19"); 1112 register unsigned long long r0 asm ("r0") = 0; 1113 register unsigned long long r1 asm ("r1") = 1; 1114 register int r2 asm ("r2") = i >> 31; 1115 register int r3 asm ("r3") = j >> 31; 1116 1117 r2 = r2 ? r2 : r1; 1118 r3 = r3 ? r3 : r1; 1119 r18 = i * r2; 1120 r19 = j * r3; 1121 r2 *= r3; 1122 1123 r19 <<= 31; 1124 r1 <<= 31; 1125 do 1126 if (r18 >= r19) 1127 r0 |= r1, r18 -= r19; 1128 while (r19 >>= 1, r1 >>= 1); 1129 1130 return r2 * (int)r0; 1131} 1132*/ 1133GLOBAL(sdivsi3): 1134 pt/l LOCAL(sdivsi3_dontadd), tr2 1135 pt/l LOCAL(sdivsi3_loop), tr1 1136 ptabs/l r18, tr0 1137 movi 0, r0 1138 movi 1, r1 1139 shari.l r4, 31, r2 1140 shari.l r5, 31, r3 1141 cmveq r2, r1, r2 1142 cmveq r3, r1, r3 1143 muls.l r4, r2, r18 1144 muls.l r5, r3, r19 1145 muls.l r2, r3, r2 1146 shlli r19, 31, r19 1147 shlli r1, 31, r1 1148LOCAL(sdivsi3_loop): 1149 bgtu r19, r18, tr2 1150 or r0, r1, r0 1151 sub r18, r19, r18 1152LOCAL(sdivsi3_dontadd): 1153 shlri r1, 1, r1 1154 shlri r19, 1, r19 1155 bnei r1, 0, tr1 1156 muls.l r0, r2, r0 1157 add.l r0, r63, r0 1158 blink tr0, r63 1159#elif 0 /* ! 
0 */ 1160 // inputs: r4,r5 1161 // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0 1162 // result in r0 1163GLOBAL(sdivsi3): 1164 // can create absolute value without extra latency, 1165 // but dependent on proper sign extension of inputs: 1166 // shari.l r5,31,r2 1167 // xor r5,r2,r20 1168 // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended. 1169 shari.l r5,31,r2 1170 ori r2,1,r2 1171 muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended. 1172 movi 0xffffffffffffbb0c,r19 // shift count eqiv 76 1173 shari.l r4,31,r3 1174 nsb r20,r0 1175 shlld r20,r0,r25 1176 shlri r25,48,r25 1177 sub r19,r25,r1 1178 mmulfx.w r1,r1,r2 1179 mshflo.w r1,r63,r1 1180 // If r4 was to be used in-place instead of r21, could use this sequence 1181 // to compute absolute: 1182 // sub r63,r4,r19 // compute absolute value of r4 1183 // shlri r4,32,r3 // into lower 32 bit of r4, keeping 1184 // mcmv r19,r3,r4 // the sign in the upper 32 bits intact. 1185 ori r3,1,r3 1186 mmulfx.w r25,r2,r2 1187 sub r19,r0,r0 1188 muls.l r4,r3,r21 1189 msub.w r1,r2,r2 1190 addi r2,-2,r1 1191 mulu.l r21,r1,r19 1192 mmulfx.w r2,r2,r2 1193 shlli r1,15,r1 1194 shlrd r19,r0,r19 1195 mulu.l r19,r20,r3 1196 mmacnfx.wl r25,r2,r1 1197 ptabs r18,tr0 1198 sub r21,r3,r25 1199 1200 mulu.l r25,r1,r2 1201 addi r0,14,r0 1202 xor r4,r5,r18 1203 shlrd r2,r0,r2 1204 mulu.l r2,r20,r3 1205 add r19,r2,r19 1206 shari.l r18,31,r18 1207 sub r25,r3,r25 1208 1209 mulu.l r25,r1,r2 1210 sub r25,r20,r25 1211 add r19,r18,r19 1212 shlrd r2,r0,r2 1213 mulu.l r2,r20,r3 1214 addi r25,1,r25 1215 add r19,r2,r19 1216 1217 cmpgt r25,r3,r25 1218 add.l r19,r25,r0 1219 xor r0,r18,r0 1220 blink tr0,r63 1221#else /* ! 0 && ! 
0 */ 1222 1223 // inputs: r4,r5 1224 // clobbered: r1,r18,r19,r20,r21,r25,tr0 1225 // result in r0 1226 HIDDEN_FUNC(GLOBAL(sdivsi3_2)) 1227#ifndef __pic__ 1228 FUNC(GLOBAL(sdivsi3)) 1229GLOBAL(sdivsi3): /* this is the shcompact entry point */ 1230 // The special SHmedia entry point sdivsi3_1 prevents accidental linking 1231 // with the SHcompact implementation, which clobbers tr1 / tr2. 1232 .global GLOBAL(sdivsi3_1) 1233GLOBAL(sdivsi3_1): 1234 .global GLOBAL(div_table_internal) 1235 movi (GLOBAL(div_table_internal) >> 16) & 65535, r20 1236 shori GLOBAL(div_table_internal) & 65535, r20 1237#endif 1238 .global GLOBAL(sdivsi3_2) 1239 // div_table in r20 1240 // clobbered: r1,r18,r19,r21,r25,tr0 1241GLOBAL(sdivsi3_2): 1242 nsb r5, r1 1243 shlld r5, r1, r25 // normalize; [-2 ..1, 1..2) in s2.62 1244 shari r25, 58, r21 // extract 5(6) bit index (s2.4 with hole -1..1) 1245 ldx.ub r20, r21, r19 // u0.8 1246 shari r25, 32, r25 // normalize to s2.30 1247 shlli r21, 1, r21 1248 muls.l r25, r19, r19 // s2.38 1249 ldx.w r20, r21, r21 // s2.14 1250 ptabs r18, tr0 1251 shari r19, 24, r19 // truncate to s2.14 1252 sub r21, r19, r19 // some 11 bit inverse in s1.14 1253 muls.l r19, r19, r21 // u0.28 1254 sub r63, r1, r1 1255 addi r1, 92, r1 1256 muls.l r25, r21, r18 // s2.58 1257 shlli r19, 45, r19 // multiply by two and convert to s2.58 1258 /* bubble */ 1259 sub r19, r18, r18 1260 shari r18, 28, r18 // some 22 bit inverse in s1.30 1261 muls.l r18, r25, r0 // s2.60 1262 muls.l r18, r4, r25 // s32.30 1263 /* bubble */ 1264 shari r0, 16, r19 // s-16.44 1265 muls.l r19, r18, r19 // s-16.74 1266 shari r25, 63, r0 1267 shari r4, 14, r18 // s19.-14 1268 shari r19, 30, r19 // s-16.44 1269 muls.l r19, r18, r19 // s15.30 1270 xor r21, r0, r21 // You could also use the constant 1 << 27. 
1271 add r21, r25, r21 1272 sub r21, r19, r21 1273 shard r21, r1, r21 1274 sub r21, r0, r0 1275 blink tr0, r63 1276#ifndef __pic__ 1277 ENDFUNC(GLOBAL(sdivsi3)) 1278#endif 1279 ENDFUNC(GLOBAL(sdivsi3_2)) 1280#endif 1281#elif __SHMEDIA__ 1282/* m5compact-nofpu */ 1283 // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2 1284 .mode SHmedia 1285 .section .text..SHmedia32,"ax" 1286 .align 2 1287 FUNC(GLOBAL(sdivsi3)) 1288GLOBAL(sdivsi3): 1289 pt/l LOCAL(sdivsi3_dontsub), tr0 1290 pt/l LOCAL(sdivsi3_loop), tr1 1291 ptabs/l r18,tr2 1292 shari.l r4,31,r18 1293 shari.l r5,31,r19 1294 xor r4,r18,r20 1295 xor r5,r19,r21 1296 sub.l r20,r18,r20 1297 sub.l r21,r19,r21 1298 xor r18,r19,r19 1299 shlli r21,32,r25 1300 addi r25,-1,r21 1301 addz.l r20,r63,r20 1302LOCAL(sdivsi3_loop): 1303 shlli r20,1,r20 1304 bgeu/u r21,r20,tr0 1305 sub r20,r21,r20 1306LOCAL(sdivsi3_dontsub): 1307 addi.l r25,-1,r25 1308 bnei r25,-32,tr1 1309 xor r20,r19,r20 1310 sub.l r20,r19,r0 1311 blink tr2,r63 1312 ENDFUNC(GLOBAL(sdivsi3)) 1313#else /* ! 
__SHMEDIA__ */ 1314 FUNC(GLOBAL(sdivsi3)) 1315GLOBAL(sdivsi3): 1316 mov r4,r1 1317 mov r5,r0 1318 1319 tst r0,r0 1320 bt div0 1321 mov #0,r2 1322 div0s r2,r1 1323 subc r3,r3 1324 subc r2,r1 1325 div0s r0,r3 1326 rotcl r1 1327 div1 r0,r3 1328 rotcl r1 1329 div1 r0,r3 1330 rotcl r1 1331 div1 r0,r3 1332 rotcl r1 1333 div1 r0,r3 1334 rotcl r1 1335 div1 r0,r3 1336 rotcl r1 1337 div1 r0,r3 1338 rotcl r1 1339 div1 r0,r3 1340 rotcl r1 1341 div1 r0,r3 1342 rotcl r1 1343 div1 r0,r3 1344 rotcl r1 1345 div1 r0,r3 1346 rotcl r1 1347 div1 r0,r3 1348 rotcl r1 1349 div1 r0,r3 1350 rotcl r1 1351 div1 r0,r3 1352 rotcl r1 1353 div1 r0,r3 1354 rotcl r1 1355 div1 r0,r3 1356 rotcl r1 1357 div1 r0,r3 1358 rotcl r1 1359 div1 r0,r3 1360 rotcl r1 1361 div1 r0,r3 1362 rotcl r1 1363 div1 r0,r3 1364 rotcl r1 1365 div1 r0,r3 1366 rotcl r1 1367 div1 r0,r3 1368 rotcl r1 1369 div1 r0,r3 1370 rotcl r1 1371 div1 r0,r3 1372 rotcl r1 1373 div1 r0,r3 1374 rotcl r1 1375 div1 r0,r3 1376 rotcl r1 1377 div1 r0,r3 1378 rotcl r1 1379 div1 r0,r3 1380 rotcl r1 1381 div1 r0,r3 1382 rotcl r1 1383 div1 r0,r3 1384 rotcl r1 1385 div1 r0,r3 1386 rotcl r1 1387 div1 r0,r3 1388 rotcl r1 1389 div1 r0,r3 1390 rotcl r1 1391 addc r2,r1 1392 rts 1393 mov r1,r0 1394 1395 1396div0: rts 1397 mov #0,r0 1398 1399 ENDFUNC(GLOBAL(sdivsi3)) 1400#endif /* ! __SHMEDIA__ */ 1401#endif /* L_sdivsi3 */ 1402 1403/*------------------------------------------------------------------------------ 1404 32 bit unsigned integer division that uses FPU double precision division. */ 1405 1406#ifdef L_udivsi3_i4 1407 .title "SH DIVIDE" 1408 1409#if defined (__SH4__) || defined (__SH2A__) 1410/* This variant is used when FPSCR.PR = 1 (double precision) is the default 1411 setting. 
1412 Args in r4 and r5, result in fpul, 1413 clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */ 1414 1415 .global GLOBAL(udivsi3_i4) 1416 HIDDEN_FUNC(GLOBAL(udivsi3_i4)) 1417GLOBAL(udivsi3_i4): 1418 mov #1,r1 1419 cmp/hi r1,r5 1420 bf/s trivial 1421 rotr r1 1422 xor r1,r4 1423 lds r4,fpul 1424 mova L1,r0 1425#ifdef FMOVD_WORKS 1426 fmov.d @r0+,dr4 1427#else 1428 fmov.s @r0+,DR40 1429 fmov.s @r0,DR41 1430#endif 1431 float fpul,dr0 1432 xor r1,r5 1433 lds r5,fpul 1434 float fpul,dr2 1435 fadd dr4,dr0 1436 fadd dr4,dr2 1437 fdiv dr2,dr0 1438 rts 1439 ftrc dr0,fpul 1440 1441trivial: 1442 rts 1443 lds r4,fpul 1444 1445 .align 2 1446#ifdef FMOVD_WORKS 1447 .align 3 // Make the double below 8 byte aligned. 1448#endif 1449L1: 1450 .double 2147483648 1451 1452 ENDFUNC(GLOBAL(udivsi3_i4)) 1453 1454#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__) 1455#if ! __SH5__ || __SH5__ == 32 1456!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33 1457 .mode SHmedia 1458 .global GLOBAL(udivsi3_i4) 1459 HIDDEN_FUNC(GLOBAL(udivsi3_i4)) 1460GLOBAL(udivsi3_i4): 1461 addz.l r4,r63,r20 1462 addz.l r5,r63,r21 1463 fmov.qd r20,dr0 1464 fmov.qd r21,dr32 1465 ptabs r18,tr0 1466 float.qd dr0,dr0 1467 float.qd dr32,dr32 1468 fdiv.d dr0,dr32,dr0 1469 ftrc.dq dr0,dr32 1470 fmov.s fr33,fr32 1471 blink tr0,r63 1472 1473 ENDFUNC(GLOBAL(udivsi3_i4)) 1474#endif /* ! __SH5__ || __SH5__ == 32 */ 1475 1476#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) 1477/* This variant is used when FPSCR.PR = 0 (sigle precision) is the default 1478 setting. 1479 Args in r4 and r5, result in fpul, 1480 clobber r0, r1, r4, r5, dr0, dr2, dr4. 1481 For this to work, we must temporarily switch the FPU do double precision, 1482 but we better do not touch FPSCR.FR. See PR 6526. 
*/ 1483 1484 .global GLOBAL(udivsi3_i4) 1485 HIDDEN_FUNC(GLOBAL(udivsi3_i4)) 1486GLOBAL(udivsi3_i4): 1487 1488#ifndef __SH4A__ 1489 mov #1,r1 1490 cmp/hi r1,r5 1491 bf/s trivial 1492 rotr r1 // r1 = 1 << 31 1493 sts.l fpscr,@-r15 1494 xor r1,r4 1495 mov.l @(0,r15),r0 1496 xor r1,r5 1497 mov.l L2,r1 1498 lds r4,fpul 1499 or r0,r1 1500 mova L1,r0 1501 lds r1,fpscr 1502#ifdef FMOVD_WORKS 1503 fmov.d @r0+,dr4 1504#else 1505 fmov.s @r0+,DR40 1506 fmov.s @r0,DR41 1507#endif 1508 float fpul,dr0 1509 lds r5,fpul 1510 float fpul,dr2 1511 fadd dr4,dr0 1512 fadd dr4,dr2 1513 fdiv dr2,dr0 1514 ftrc dr0,fpul 1515 rts 1516 lds.l @r15+,fpscr 1517 1518#ifdef FMOVD_WORKS 1519 .align 3 // Make the double below 8 byte aligned. 1520#endif 1521trivial: 1522 rts 1523 lds r4,fpul 1524 1525 .align 2 1526L2: 1527#ifdef FMOVD_WORKS 1528 .long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1 1529#else 1530 .long 0x80000 // FPSCR.PR = 1 1531#endif 1532L1: 1533 .double 2147483648 1534 1535#else 1536/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. 1537 Although on SH4A fmovd usually works, it would require either additional 1538 two fschg instructions or an FPSCR push + pop. It's not worth the effort 1539 for loading only one double constant. */ 1540 mov #1,r1 1541 cmp/hi r1,r5 1542 bf/s trivial 1543 rotr r1 // r1 = 1 << 31 1544 fpchg 1545 mova L1,r0 1546 xor r1,r4 1547 fmov.s @r0+,DR40 1548 lds r4,fpul 1549 fmov.s @r0,DR41 1550 xor r1,r5 1551 float fpul,dr0 1552 lds r5,fpul 1553 float fpul,dr2 1554 fadd dr4,dr0 1555 fadd dr4,dr2 1556 fdiv dr2,dr0 1557 ftrc dr0,fpul 1558 rts 1559 fpchg 1560 1561trivial: 1562 rts 1563 lds r4,fpul 1564 1565 .align 2 1566L1: 1567 .double 2147483648 1568 1569#endif /* __SH4A__ */ 1570 1571 1572 ENDFUNC(GLOBAL(udivsi3_i4)) 1573#endif /* ! __SH4__ */ 1574#endif /* L_udivsi3_i4 */ 1575 1576#ifdef L_udivsi3 1577/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with 1578 sh2e/sh3e code. */ 1579 1580!! 
args in r4 and r5, result in r0, clobbers r4, pr, and t bit 1581 .global GLOBAL(udivsi3) 1582 HIDDEN_FUNC(GLOBAL(udivsi3)) 1583 1584#if __SHMEDIA__ 1585#if __SH5__ == 32 1586 .section .text..SHmedia32,"ax" 1587#else 1588 .text 1589#endif 1590 .align 2 1591#if 0 1592/* The assembly code that follows is a hand-optimized version of the C 1593 code that follows. Note that the registers that are modified are 1594 exactly those listed as clobbered in the patterns udivsi3_i1 and 1595 udivsi3_i1_media. 1596 1597unsigned 1598__udivsi3 (i, j) 1599 unsigned i, j; 1600{ 1601 register unsigned long long r0 asm ("r0") = 0; 1602 register unsigned long long r18 asm ("r18") = 1; 1603 register unsigned long long r4 asm ("r4") = i; 1604 register unsigned long long r19 asm ("r19") = j; 1605 1606 r19 <<= 31; 1607 r18 <<= 31; 1608 do 1609 if (r4 >= r19) 1610 r0 |= r18, r4 -= r19; 1611 while (r19 >>= 1, r18 >>= 1); 1612 1613 return r0; 1614} 1615*/ 1616GLOBAL(udivsi3): 1617 pt/l LOCAL(udivsi3_dontadd), tr2 1618 pt/l LOCAL(udivsi3_loop), tr1 1619 ptabs/l r18, tr0 1620 movi 0, r0 1621 movi 1, r18 1622 addz.l r5, r63, r19 1623 addz.l r4, r63, r4 1624 shlli r19, 31, r19 1625 shlli r18, 31, r18 1626LOCAL(udivsi3_loop): 1627 bgtu r19, r4, tr2 1628 or r0, r18, r0 1629 sub r4, r19, r4 1630LOCAL(udivsi3_dontadd): 1631 shlri r18, 1, r18 1632 shlri r19, 1, r19 1633 bnei r18, 0, tr1 1634 blink tr0, r63 1635#else 1636GLOBAL(udivsi3): 1637 // inputs: r4,r5 1638 // clobbered: r18,r19,r20,r21,r22,r25,tr0 1639 // result in r0. 
1640 addz.l r5,r63,r22 1641 nsb r22,r0 1642 shlld r22,r0,r25 1643 shlri r25,48,r25 1644 movi 0xffffffffffffbb0c,r20 // shift count eqiv 76 1645 sub r20,r25,r21 1646 mmulfx.w r21,r21,r19 1647 mshflo.w r21,r63,r21 1648 ptabs r18,tr0 1649 mmulfx.w r25,r19,r19 1650 sub r20,r0,r0 1651 /* bubble */ 1652 msub.w r21,r19,r19 1653 addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21 1654 before the msub.w, but we need a different value for 1655 r19 to keep errors under control. */ 1656 mulu.l r4,r21,r18 1657 mmulfx.w r19,r19,r19 1658 shlli r21,15,r21 1659 shlrd r18,r0,r18 1660 mulu.l r18,r22,r20 1661 mmacnfx.wl r25,r19,r21 1662 /* bubble */ 1663 sub r4,r20,r25 1664 1665 mulu.l r25,r21,r19 1666 addi r0,14,r0 1667 /* bubble */ 1668 shlrd r19,r0,r19 1669 mulu.l r19,r22,r20 1670 add r18,r19,r18 1671 /* bubble */ 1672 sub.l r25,r20,r25 1673 1674 mulu.l r25,r21,r19 1675 addz.l r25,r63,r25 1676 sub r25,r22,r25 1677 shlrd r19,r0,r19 1678 mulu.l r19,r22,r20 1679 addi r25,1,r25 1680 add r18,r19,r18 1681 1682 cmpgt r25,r20,r25 1683 add.l r18,r25,r0 1684 blink tr0,r63 1685#endif 1686#elif __SHMEDIA__ 1687/* m5compact-nofpu - more emphasis on code size than on speed, but don't 1688 ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4. 1689 So use a short shmedia loop. */ 1690 // clobbered: r20,r21,r25,tr0,tr1,tr2 1691 .mode SHmedia 1692 .section .text..SHmedia32,"ax" 1693 .align 2 1694GLOBAL(udivsi3): 1695 pt/l LOCAL(udivsi3_dontsub), tr0 1696 pt/l LOCAL(udivsi3_loop), tr1 1697 ptabs/l r18,tr2 1698 shlli r5,32,r25 1699 addi r25,-1,r21 1700 addz.l r4,r63,r20 1701LOCAL(udivsi3_loop): 1702 shlli r20,1,r20 1703 bgeu/u r21,r20,tr0 1704 sub r20,r21,r20 1705LOCAL(udivsi3_dontsub): 1706 addi.l r25,-1,r25 1707 bnei r25,-32,tr1 1708 add.l r20,r63,r0 1709 blink tr2,r63 1710#else /* ! 
__SHMEDIA__ */ 1711LOCAL(div8): 1712 div1 r5,r4 1713LOCAL(div7): 1714 div1 r5,r4; div1 r5,r4; div1 r5,r4 1715 div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4 1716 1717LOCAL(divx4): 1718 div1 r5,r4; rotcl r0 1719 div1 r5,r4; rotcl r0 1720 div1 r5,r4; rotcl r0 1721 rts; div1 r5,r4 1722 1723GLOBAL(udivsi3): 1724 sts.l pr,@-r15 1725 extu.w r5,r0 1726 cmp/eq r5,r0 1727#ifdef __sh1__ 1728 bf LOCAL(large_divisor) 1729#else 1730 bf/s LOCAL(large_divisor) 1731#endif 1732 div0u 1733 swap.w r4,r0 1734 shlr16 r4 1735 bsr LOCAL(div8) 1736 shll16 r5 1737 bsr LOCAL(div7) 1738 div1 r5,r4 1739 xtrct r4,r0 1740 xtrct r0,r4 1741 bsr LOCAL(div8) 1742 swap.w r4,r4 1743 bsr LOCAL(div7) 1744 div1 r5,r4 1745 lds.l @r15+,pr 1746 xtrct r4,r0 1747 swap.w r0,r0 1748 rotcl r0 1749 rts 1750 shlr16 r5 1751 1752LOCAL(large_divisor): 1753#ifdef __sh1__ 1754 div0u 1755#endif 1756 mov #0,r0 1757 xtrct r4,r0 1758 xtrct r0,r4 1759 bsr LOCAL(divx4) 1760 rotcl r0 1761 bsr LOCAL(divx4) 1762 rotcl r0 1763 bsr LOCAL(divx4) 1764 rotcl r0 1765 bsr LOCAL(divx4) 1766 rotcl r0 1767 lds.l @r15+,pr 1768 rts 1769 rotcl r0 1770 1771 ENDFUNC(GLOBAL(udivsi3)) 1772#endif /* ! __SHMEDIA__ */ 1773#endif /* L_udivsi3 */ 1774 1775#ifdef L_udivdi3 1776#if __SHMEDIA__ 1777 .mode SHmedia 1778 .section .text..SHmedia32,"ax" 1779 .align 2 1780 .global GLOBAL(udivdi3) 1781 FUNC(GLOBAL(udivdi3)) 1782GLOBAL(udivdi3): 1783 HIDDEN_ALIAS(udivdi3_internal,udivdi3) 1784 shlri r3,1,r4 1785 nsb r4,r22 1786 shlld r3,r22,r6 1787 shlri r6,49,r5 1788 movi 0xffffffffffffbaf1,r21 /* .l shift count 17. 
*/ 1789 sub r21,r5,r1 1790 mmulfx.w r1,r1,r4 1791 mshflo.w r1,r63,r1 1792 sub r63,r22,r20 // r63 == 64 % 64 1793 mmulfx.w r5,r4,r4 1794 pta LOCAL(large_divisor),tr0 1795 addi r20,32,r9 1796 msub.w r1,r4,r1 1797 madd.w r1,r1,r1 1798 mmulfx.w r1,r1,r4 1799 shlri r6,32,r7 1800 bgt/u r9,r63,tr0 // large_divisor 1801 mmulfx.w r5,r4,r4 1802 shlri r2,32+14,r19 1803 addi r22,-31,r0 1804 msub.w r1,r4,r1 1805 1806 mulu.l r1,r7,r4 1807 addi r1,-3,r5 1808 mulu.l r5,r19,r5 1809 sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 1810 shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as 1811 the case may be, %0000000000000000 000.11111111111, still */ 1812 muls.l r1,r4,r4 /* leaving at least one sign bit. */ 1813 mulu.l r5,r3,r8 1814 mshalds.l r1,r21,r1 1815 shari r4,26,r4 1816 shlld r8,r0,r8 1817 add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) 1818 sub r2,r8,r2 1819 /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ 1820 1821 shlri r2,22,r21 1822 mulu.l r21,r1,r21 1823 shlld r5,r0,r8 1824 addi r20,30-22,r0 1825 shlrd r21,r0,r21 1826 mulu.l r21,r3,r5 1827 add r8,r21,r8 1828 mcmpgt.l r21,r63,r21 // See Note 1 1829 addi r20,30,r0 1830 mshfhi.l r63,r21,r21 1831 sub r2,r5,r2 1832 andc r2,r21,r2 1833 1834 /* small divisor: need a third divide step */ 1835 mulu.l r2,r1,r7 1836 ptabs r18,tr0 1837 addi r2,1,r2 1838 shlrd r7,r0,r7 1839 mulu.l r7,r3,r5 1840 add r8,r7,r8 1841 sub r2,r3,r2 1842 cmpgt r2,r5,r5 1843 add r8,r5,r2 1844 /* could test r3 here to check for divide by zero. 
*/ 1845 blink tr0,r63 1846 1847LOCAL(large_divisor): 1848 mmulfx.w r5,r4,r4 1849 shlrd r2,r9,r25 1850 shlri r25,32,r8 1851 msub.w r1,r4,r1 1852 1853 mulu.l r1,r7,r4 1854 addi r1,-3,r5 1855 mulu.l r5,r8,r5 1856 sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 1857 shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as 1858 the case may be, %0000000000000000 000.11111111111, still */ 1859 muls.l r1,r4,r4 /* leaving at least one sign bit. */ 1860 shlri r5,14-1,r8 1861 mulu.l r8,r7,r5 1862 mshalds.l r1,r21,r1 1863 shari r4,26,r4 1864 add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) 1865 sub r25,r5,r25 1866 /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ 1867 1868 shlri r25,22,r21 1869 mulu.l r21,r1,r21 1870 pta LOCAL(no_lo_adj),tr0 1871 addi r22,32,r0 1872 shlri r21,40,r21 1873 mulu.l r21,r7,r5 1874 add r8,r21,r8 1875 shlld r2,r0,r2 1876 sub r25,r5,r25 1877 bgtu/u r7,r25,tr0 // no_lo_adj 1878 addi r8,1,r8 1879 sub r25,r7,r25 1880LOCAL(no_lo_adj): 1881 mextr4 r2,r25,r2 1882 1883 /* large_divisor: only needs a few adjustments. */ 1884 mulu.l r8,r6,r5 1885 ptabs r18,tr0 1886 /* bubble */ 1887 cmpgtu r5,r2,r5 1888 sub r8,r5,r2 1889 blink tr0,r63 1890 ENDFUNC(GLOBAL(udivdi3)) 1891/* Note 1: To shift the result of the second divide stage so that the result 1892 always fits into 32 bits, yet we still reduce the rest sufficiently 1893 would require a lot of instructions to do the shifts just right. Using 1894 the full 64 bit shift result to multiply with the divisor would require 1895 four extra instructions for the upper 32 bits (shift / mulu / shift / sub). 1896 Fortunately, if the upper 32 bits of the shift result are nonzero, we 1897 know that the rest after taking this partial result into account will 1898 fit into 32 bits. So we just clear the upper 32 bits of the rest if the 1899 upper 32 bits of the partial result are nonzero. 
*/ 1900#endif /* __SHMEDIA__ */ 1901#endif /* L_udivdi3 */ 1902 1903#ifdef L_divdi3 1904#if __SHMEDIA__ 1905 .mode SHmedia 1906 .section .text..SHmedia32,"ax" 1907 .align 2 1908 .global GLOBAL(divdi3) 1909 FUNC(GLOBAL(divdi3)) 1910GLOBAL(divdi3): 1911 pta GLOBAL(udivdi3_internal),tr0 1912 shari r2,63,r22 1913 shari r3,63,r23 1914 xor r2,r22,r2 1915 xor r3,r23,r3 1916 sub r2,r22,r2 1917 sub r3,r23,r3 1918 beq/u r22,r23,tr0 1919 ptabs r18,tr1 1920 blink tr0,r18 1921 sub r63,r2,r2 1922 blink tr1,r63 1923 ENDFUNC(GLOBAL(divdi3)) 1924#endif /* __SHMEDIA__ */ 1925#endif /* L_divdi3 */ 1926 1927#ifdef L_umoddi3 1928#if __SHMEDIA__ 1929 .mode SHmedia 1930 .section .text..SHmedia32,"ax" 1931 .align 2 1932 .global GLOBAL(umoddi3) 1933 FUNC(GLOBAL(umoddi3)) 1934GLOBAL(umoddi3): 1935 HIDDEN_ALIAS(umoddi3_internal,umoddi3) 1936 shlri r3,1,r4 1937 nsb r4,r22 1938 shlld r3,r22,r6 1939 shlri r6,49,r5 1940 movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */ 1941 sub r21,r5,r1 1942 mmulfx.w r1,r1,r4 1943 mshflo.w r1,r63,r1 1944 sub r63,r22,r20 // r63 == 64 % 64 1945 mmulfx.w r5,r4,r4 1946 pta LOCAL(large_divisor),tr0 1947 addi r20,32,r9 1948 msub.w r1,r4,r1 1949 madd.w r1,r1,r1 1950 mmulfx.w r1,r1,r4 1951 shlri r6,32,r7 1952 bgt/u r9,r63,tr0 // large_divisor 1953 mmulfx.w r5,r4,r4 1954 shlri r2,32+14,r19 1955 addi r22,-31,r0 1956 msub.w r1,r4,r1 1957 1958 mulu.l r1,r7,r4 1959 addi r1,-3,r5 1960 mulu.l r5,r19,r5 1961 sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 1962 shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as 1963 the case may be, %0000000000000000 000.11111111111, still */ 1964 muls.l r1,r4,r4 /* leaving at least one sign bit. */ 1965 mulu.l r5,r3,r5 1966 mshalds.l r1,r21,r1 1967 shari r4,26,r4 1968 shlld r5,r0,r5 1969 add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) 1970 sub r2,r5,r2 1971 /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. 
*/ 1972 1973 shlri r2,22,r21 1974 mulu.l r21,r1,r21 1975 addi r20,30-22,r0 1976 /* bubble */ /* could test r3 here to check for divide by zero. */ 1977 shlrd r21,r0,r21 1978 mulu.l r21,r3,r5 1979 mcmpgt.l r21,r63,r21 // See Note 1 1980 addi r20,30,r0 1981 mshfhi.l r63,r21,r21 1982 sub r2,r5,r2 1983 andc r2,r21,r2 1984 1985 /* small divisor: need a third divide step */ 1986 mulu.l r2,r1,r7 1987 ptabs r18,tr0 1988 sub r2,r3,r8 /* re-use r8 here for rest - r3 */ 1989 shlrd r7,r0,r7 1990 mulu.l r7,r3,r5 1991 /* bubble */ 1992 addi r8,1,r7 1993 cmpgt r7,r5,r7 1994 cmvne r7,r8,r2 1995 sub r2,r5,r2 1996 blink tr0,r63 1997 1998LOCAL(large_divisor): 1999 mmulfx.w r5,r4,r4 2000 shlrd r2,r9,r25 2001 shlri r25,32,r8 2002 msub.w r1,r4,r1 2003 2004 mulu.l r1,r7,r4 2005 addi r1,-3,r5 2006 mulu.l r5,r8,r5 2007 sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 2008 shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as 2009 the case may be, %0000000000000000 000.11111111111, still */ 2010 muls.l r1,r4,r4 /* leaving at least one sign bit. */ 2011 shlri r5,14-1,r8 2012 mulu.l r8,r7,r5 2013 mshalds.l r1,r21,r1 2014 shari r4,26,r4 2015 add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) 2016 sub r25,r5,r25 2017 /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ 2018 2019 shlri r25,22,r21 2020 mulu.l r21,r1,r21 2021 pta LOCAL(no_lo_adj),tr0 2022 addi r22,32,r0 2023 shlri r21,40,r21 2024 mulu.l r21,r7,r5 2025 add r8,r21,r8 2026 shlld r2,r0,r2 2027 sub r25,r5,r25 2028 bgtu/u r7,r25,tr0 // no_lo_adj 2029 addi r8,1,r8 2030 sub r25,r7,r25 2031LOCAL(no_lo_adj): 2032 mextr4 r2,r25,r2 2033 2034 /* large_divisor: only needs a few adjustments. 
*/ 2035 mulu.l r8,r6,r5 2036 ptabs r18,tr0 2037 add r2,r6,r7 2038 cmpgtu r5,r2,r8 2039 cmvne r8,r7,r2 2040 sub r2,r5,r2 2041 shlrd r2,r22,r2 2042 blink tr0,r63 2043 ENDFUNC(GLOBAL(umoddi3)) 2044/* Note 1: To shift the result of the second divide stage so that the result 2045 always fits into 32 bits, yet we still reduce the rest sufficiently 2046 would require a lot of instructions to do the shifts just right. Using 2047 the full 64 bit shift result to multiply with the divisor would require 2048 four extra instructions for the upper 32 bits (shift / mulu / shift / sub). 2049 Fortunately, if the upper 32 bits of the shift result are nonzero, we 2050 know that the rest after taking this partial result into account will 2051 fit into 32 bits. So we just clear the upper 32 bits of the rest if the 2052 upper 32 bits of the partial result are nonzero. */ 2053#endif /* __SHMEDIA__ */ 2054#endif /* L_umoddi3 */ 2055 2056#ifdef L_moddi3 2057#if __SHMEDIA__ 2058 .mode SHmedia 2059 .section .text..SHmedia32,"ax" 2060 .align 2 2061 .global GLOBAL(moddi3) 2062 FUNC(GLOBAL(moddi3)) 2063GLOBAL(moddi3): 2064 pta GLOBAL(umoddi3_internal),tr0 2065 shari r2,63,r22 2066 shari r3,63,r23 2067 xor r2,r22,r2 2068 xor r3,r23,r3 2069 sub r2,r22,r2 2070 sub r3,r23,r3 2071 beq/u r22,r63,tr0 2072 ptabs r18,tr1 2073 blink tr0,r18 2074 sub r63,r2,r2 2075 blink tr1,r63 2076 ENDFUNC(GLOBAL(moddi3)) 2077#endif /* __SHMEDIA__ */ 2078#endif /* L_moddi3 */ 2079 2080#ifdef L_set_fpscr 2081#if !defined (__SH2A_NOFPU__) 2082#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32 2083#ifdef __SH5__ 2084 .mode SHcompact 2085#endif 2086 .global GLOBAL(set_fpscr) 2087 HIDDEN_FUNC(GLOBAL(set_fpscr)) 2088GLOBAL(set_fpscr): 2089 lds r4,fpscr 2090#ifdef __PIC__ 2091 mov.l r12,@-r15 2092#ifdef __vxworks 2093 mov.l LOCAL(set_fpscr_L0_base),r12 2094 mov.l LOCAL(set_fpscr_L0_index),r0 2095 mov.l @r12,r12 2096 
mov.l @(r0,r12),r12 2097#else 2098 mova LOCAL(set_fpscr_L0),r0 2099 mov.l LOCAL(set_fpscr_L0),r12 2100 add r0,r12 2101#endif 2102 mov.l LOCAL(set_fpscr_L1),r0 2103 mov.l @(r0,r12),r1 2104 mov.l @r15+,r12 2105#else 2106 mov.l LOCAL(set_fpscr_L1),r1 2107#endif 2108 swap.w r4,r0 2109 or #24,r0 2110#ifndef FMOVD_WORKS 2111 xor #16,r0 2112#endif 2113#if defined(__SH4__) || defined (__SH2A_DOUBLE__) 2114 swap.w r0,r3 2115 mov.l r3,@(4,r1) 2116#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */ 2117 swap.w r0,r2 2118 mov.l r2,@r1 2119#endif 2120#ifndef FMOVD_WORKS 2121 xor #8,r0 2122#else 2123 xor #24,r0 2124#endif 2125#if defined(__SH4__) || defined (__SH2A_DOUBLE__) 2126 swap.w r0,r2 2127 rts 2128 mov.l r2,@r1 2129#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */ 2130 swap.w r0,r3 2131 rts 2132 mov.l r3,@(4,r1) 2133#endif 2134 .align 2 2135#ifdef __PIC__ 2136#ifdef __vxworks 2137LOCAL(set_fpscr_L0_base): 2138 .long ___GOTT_BASE__ 2139LOCAL(set_fpscr_L0_index): 2140 .long ___GOTT_INDEX__ 2141#else 2142LOCAL(set_fpscr_L0): 2143 .long _GLOBAL_OFFSET_TABLE_ 2144#endif 2145LOCAL(set_fpscr_L1): 2146 .long GLOBAL(fpscr_values@GOT) 2147#else 2148LOCAL(set_fpscr_L1): 2149 .long GLOBAL(fpscr_values) 2150#endif 2151 2152 ENDFUNC(GLOBAL(set_fpscr)) 2153#ifndef NO_FPSCR_VALUES 2154#ifdef __ELF__ 2155 .comm GLOBAL(fpscr_values),8,4 2156#else 2157 .comm GLOBAL(fpscr_values),8 2158#endif /* ELF */ 2159#endif /* NO_FPSCR_VALUES */ 2160#endif /* SH2E / SH3E / SH4 */ 2161#endif /* __SH2A_NOFPU__ */ 2162#endif /* L_set_fpscr */ 2163#ifdef L_ic_invalidate 2164#if __SH5__ == 32 2165 .mode SHmedia 2166 .section .text..SHmedia32,"ax" 2167 .align 2 2168 .global GLOBAL(init_trampoline) 2169 HIDDEN_FUNC(GLOBAL(init_trampoline)) 2170GLOBAL(init_trampoline): 2171 st.l r0,8,r2 2172#ifdef __LITTLE_ENDIAN__ 2173 movi 9,r20 2174 shori 0x402b,r20 2175 shori 0xd101,r20 2176 shori 0xd002,r20 2177#else 2178 movi 0xffffffffffffd002,r20 2179 shori 
0xd101,r20 2180 shori 0x402b,r20 2181 shori 9,r20 2182#endif 2183 st.q r0,0,r20 2184 st.l r0,12,r3 2185 ENDFUNC(GLOBAL(init_trampoline)) 2186 .global GLOBAL(ic_invalidate) 2187 HIDDEN_FUNC(GLOBAL(ic_invalidate)) 2188GLOBAL(ic_invalidate): 2189 ocbwb r0,0 2190 synco 2191 icbi r0, 0 2192 ptabs r18, tr0 2193 synci 2194 blink tr0, r63 2195 ENDFUNC(GLOBAL(ic_invalidate)) 2196#elif defined(__SH4A__) 2197 .global GLOBAL(ic_invalidate) 2198 HIDDEN_FUNC(GLOBAL(ic_invalidate)) 2199GLOBAL(ic_invalidate): 2200 ocbwb @r4 2201 synco 2202 icbi @r4 2203 rts 2204 nop 2205 ENDFUNC(GLOBAL(ic_invalidate)) 2206#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)) 2207 /* For system code, we use ic_invalidate_line_i, but user code 2208 needs a different mechanism. A kernel call is generally not 2209 available, and it would also be slow. Different SH4 variants use 2210 different sizes and associativities of the Icache. We use a small 2211 bit of dispatch code that can be put hidden in every shared object, 2212 which calls the actual processor-specific invalidation code in a 2213 separate module. 2214 Or if you have operating system support, the OS could mmap the 2215 procesor-specific code from a single page, since it is highly 2216 repetitive. 
*/ 2217 .global GLOBAL(ic_invalidate) 2218 HIDDEN_FUNC(GLOBAL(ic_invalidate)) 2219GLOBAL(ic_invalidate): 2220#ifdef __pic__ 2221#ifdef __vxworks 2222 mov.l 1f,r1 2223 mov.l 2f,r0 2224 mov.l @r1,r1 2225 mov.l 0f,r2 2226 mov.l @(r0,r1),r0 2227#else 2228 mov.l 1f,r1 2229 mova 1f,r0 2230 mov.l 0f,r2 2231 add r1,r0 2232#endif 2233 mov.l @(r0,r2),r1 2234#else 2235 mov.l 0f,r1 2236#endif 2237 ocbwb @r4 2238 mov.l @(8,r1),r0 2239 sub r1,r4 2240 and r4,r0 2241 add r1,r0 2242 jmp @r0 2243 mov.l @(4,r1),r0 2244 .align 2 2245#ifndef __pic__ 22460: .long GLOBAL(ic_invalidate_array) 2247#else /* __pic__ */ 2248 .global GLOBAL(ic_invalidate_array) 22490: .long GLOBAL(ic_invalidate_array)@GOT 2250#ifdef __vxworks 22511: .long ___GOTT_BASE__ 22522: .long ___GOTT_INDEX__ 2253#else 22541: .long _GLOBAL_OFFSET_TABLE_ 2255#endif 2256 ENDFUNC(GLOBAL(ic_invalidate)) 2257#endif /* __pic__ */ 2258#endif /* SH4 */ 2259#endif /* L_ic_invalidate */ 2260 2261#ifdef L_ic_invalidate_array 2262#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)))) 2263 .global GLOBAL(ic_invalidate_array) 2264 /* This is needed when an SH4 dso with trampolines is used on SH4A. */ 2265 .global GLOBAL(ic_invalidate_array) 2266 FUNC(GLOBAL(ic_invalidate_array)) 2267GLOBAL(ic_invalidate_array): 2268 add r1,r4 2269 synco 2270 icbi @r4 2271 rts 2272 nop 2273 .align 2 2274 .long 0 2275 ENDFUNC(GLOBAL(ic_invalidate_array)) 2276#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)) 2277 .global GLOBAL(ic_invalidate_array) 2278 .p2align 5 2279 FUNC(GLOBAL(ic_invalidate_array)) 2280/* This must be aligned to the beginning of a cache line. 
*/ 2281GLOBAL(ic_invalidate_array): 2282#ifndef WAYS 2283#define WAYS 4 2284#define WAY_SIZE 0x4000 2285#endif 2286#if WAYS == 1 2287 .rept WAY_SIZE * WAYS / 32 2288 rts 2289 nop 2290 .rept 7 2291 .long WAY_SIZE - 32 2292 .endr 2293 .endr 2294#elif WAYS <= 6 2295 .rept WAY_SIZE * WAYS / 32 2296 braf r0 2297 add #-8,r0 2298 .long WAY_SIZE + 8 2299 .long WAY_SIZE - 32 2300 .rept WAYS-2 2301 braf r0 2302 nop 2303 .endr 2304 .rept 7 - WAYS 2305 rts 2306 nop 2307 .endr 2308 .endr 2309#else /* WAYS > 6 */ 2310 /* This variant needs two different pages for mmap-ing. */ 2311 .rept WAYS-1 2312 .rept WAY_SIZE / 32 2313 braf r0 2314 nop 2315 .long WAY_SIZE 2316 .rept 6 2317 .long WAY_SIZE - 32 2318 .endr 2319 .endr 2320 .endr 2321 .rept WAY_SIZE / 32 2322 rts 2323 .rept 15 2324 nop 2325 .endr 2326 .endr 2327#endif /* WAYS */ 2328 ENDFUNC(GLOBAL(ic_invalidate_array)) 2329#endif /* SH4 */ 2330#endif /* L_ic_invalidate_array */ 2331 2332#if defined (__SH5__) && __SH5__ == 32 2333#ifdef L_shcompact_call_trampoline 2334 .section .rodata 2335 .align 1 2336LOCAL(ct_main_table): 2337.word LOCAL(ct_r2_fp) - datalabel LOCAL(ct_main_label) 2338.word LOCAL(ct_r2_ld) - datalabel LOCAL(ct_main_label) 2339.word LOCAL(ct_r2_pop) - datalabel LOCAL(ct_main_label) 2340.word LOCAL(ct_r3_fp) - datalabel LOCAL(ct_main_label) 2341.word LOCAL(ct_r3_ld) - datalabel LOCAL(ct_main_label) 2342.word LOCAL(ct_r3_pop) - datalabel LOCAL(ct_main_label) 2343.word LOCAL(ct_r4_fp) - datalabel LOCAL(ct_main_label) 2344.word LOCAL(ct_r4_ld) - datalabel LOCAL(ct_main_label) 2345.word LOCAL(ct_r4_pop) - datalabel LOCAL(ct_main_label) 2346.word LOCAL(ct_r5_fp) - datalabel LOCAL(ct_main_label) 2347.word LOCAL(ct_r5_ld) - datalabel LOCAL(ct_main_label) 2348.word LOCAL(ct_r5_pop) - datalabel LOCAL(ct_main_label) 2349.word LOCAL(ct_r6_fph) - datalabel LOCAL(ct_main_label) 2350.word LOCAL(ct_r6_fpl) - datalabel LOCAL(ct_main_label) 2351.word LOCAL(ct_r6_ld) - datalabel LOCAL(ct_main_label) 2352.word LOCAL(ct_r6_pop) - 
datalabel LOCAL(ct_main_label) 2353.word LOCAL(ct_r7_fph) - datalabel LOCAL(ct_main_label) 2354.word LOCAL(ct_r7_fpl) - datalabel LOCAL(ct_main_label) 2355.word LOCAL(ct_r7_ld) - datalabel LOCAL(ct_main_label) 2356.word LOCAL(ct_r7_pop) - datalabel LOCAL(ct_main_label) 2357.word LOCAL(ct_r8_fph) - datalabel LOCAL(ct_main_label) 2358.word LOCAL(ct_r8_fpl) - datalabel LOCAL(ct_main_label) 2359.word LOCAL(ct_r8_ld) - datalabel LOCAL(ct_main_label) 2360.word LOCAL(ct_r8_pop) - datalabel LOCAL(ct_main_label) 2361.word LOCAL(ct_r9_fph) - datalabel LOCAL(ct_main_label) 2362.word LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label) 2363.word LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label) 2364.word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label) 2365.word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label) 2366.word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label) 2367.word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label) 2368.word LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label) 2369.word LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label) 2370 .mode SHmedia 2371 .section .text..SHmedia32, "ax" 2372 .align 2 2373 2374 /* This function loads 64-bit general-purpose registers from the 2375 stack, from a memory address contained in them or from an FP 2376 register, according to a cookie passed in r1. Its execution 2377 time is linear on the number of registers that actually have 2378 to be copied. See sh.h for details on the actual bit pattern. 2379 2380 The function to be called is passed in r0. If a 32-bit return 2381 value is expected, the actual function will be tail-called, 2382 otherwise the return address will be stored in r10 (that the 2383 caller should expect to be clobbered) and the return value 2384 will be expanded into r2/r3 upon return. */ 2385 2386 .global GLOBAL(GCC_shcompact_call_trampoline) 2387 FUNC(GLOBAL(GCC_shcompact_call_trampoline)) 2388GLOBAL(GCC_shcompact_call_trampoline): 2389 ptabs/l r0, tr0 /* Prepare to call the actual function. 
*/ 2390 movi ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0 2391 pt/l LOCAL(ct_loop), tr1 2392 addz.l r1, r63, r1 2393 shori ((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0 2394LOCAL(ct_loop): 2395 nsb r1, r28 2396 shlli r28, 1, r29 2397 ldx.w r0, r29, r30 2398LOCAL(ct_main_label): 2399 ptrel/l r30, tr2 2400 blink tr2, r63 2401LOCAL(ct_r2_fp): /* Copy r2 from an FP register. */ 2402 /* It must be dr0, so just do it. */ 2403 fmov.dq dr0, r2 2404 movi 7, r30 2405 shlli r30, 29, r31 2406 andc r1, r31, r1 2407 blink tr1, r63 2408LOCAL(ct_r3_fp): /* Copy r3 from an FP register. */ 2409 /* It is either dr0 or dr2. */ 2410 movi 7, r30 2411 shlri r1, 26, r32 2412 shlli r30, 26, r31 2413 andc r1, r31, r1 2414 fmov.dq dr0, r3 2415 beqi/l r32, 4, tr1 2416 fmov.dq dr2, r3 2417 blink tr1, r63 2418LOCAL(ct_r4_fp): /* Copy r4 from an FP register. */ 2419 shlri r1, 23 - 3, r34 2420 andi r34, 3 << 3, r33 2421 addi r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32 2422LOCAL(ct_r4_fp_base): 2423 ptrel/l r32, tr2 2424 movi 7, r30 2425 shlli r30, 23, r31 2426 andc r1, r31, r1 2427 blink tr2, r63 2428LOCAL(ct_r4_fp_copy): 2429 fmov.dq dr0, r4 2430 blink tr1, r63 2431 fmov.dq dr2, r4 2432 blink tr1, r63 2433 fmov.dq dr4, r4 2434 blink tr1, r63 2435LOCAL(ct_r5_fp): /* Copy r5 from an FP register. */ 2436 shlri r1, 20 - 3, r34 2437 andi r34, 3 << 3, r33 2438 addi r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32 2439LOCAL(ct_r5_fp_base): 2440 ptrel/l r32, tr2 2441 movi 7, r30 2442 shlli r30, 20, r31 2443 andc r1, r31, r1 2444 blink tr2, r63 2445LOCAL(ct_r5_fp_copy): 2446 fmov.dq dr0, r5 2447 blink tr1, r63 2448 fmov.dq dr2, r5 2449 blink tr1, r63 2450 fmov.dq dr4, r5 2451 blink tr1, r63 2452 fmov.dq dr6, r5 2453 blink tr1, r63 2454LOCAL(ct_r6_fph): /* Copy r6 from a high FP register. */ 2455 /* It must be dr8. 
*/ 2456 fmov.dq dr8, r6 2457 movi 15, r30 2458 shlli r30, 16, r31 2459 andc r1, r31, r1 2460 blink tr1, r63 2461LOCAL(ct_r6_fpl): /* Copy r6 from a low FP register. */ 2462 shlri r1, 16 - 3, r34 2463 andi r34, 3 << 3, r33 2464 addi r33, LOCAL(ct_r6_fp_copy) - datalabel LOCAL(ct_r6_fp_base), r32 2465LOCAL(ct_r6_fp_base): 2466 ptrel/l r32, tr2 2467 movi 7, r30 2468 shlli r30, 16, r31 2469 andc r1, r31, r1 2470 blink tr2, r63 2471LOCAL(ct_r6_fp_copy): 2472 fmov.dq dr0, r6 2473 blink tr1, r63 2474 fmov.dq dr2, r6 2475 blink tr1, r63 2476 fmov.dq dr4, r6 2477 blink tr1, r63 2478 fmov.dq dr6, r6 2479 blink tr1, r63 2480LOCAL(ct_r7_fph): /* Copy r7 from a high FP register. */ 2481 /* It is either dr8 or dr10. */ 2482 movi 15 << 12, r31 2483 shlri r1, 12, r32 2484 andc r1, r31, r1 2485 fmov.dq dr8, r7 2486 beqi/l r32, 8, tr1 2487 fmov.dq dr10, r7 2488 blink tr1, r63 2489LOCAL(ct_r7_fpl): /* Copy r7 from a low FP register. */ 2490 shlri r1, 12 - 3, r34 2491 andi r34, 3 << 3, r33 2492 addi r33, LOCAL(ct_r7_fp_copy) - datalabel LOCAL(ct_r7_fp_base), r32 2493LOCAL(ct_r7_fp_base): 2494 ptrel/l r32, tr2 2495 movi 7 << 12, r31 2496 andc r1, r31, r1 2497 blink tr2, r63 2498LOCAL(ct_r7_fp_copy): 2499 fmov.dq dr0, r7 2500 blink tr1, r63 2501 fmov.dq dr2, r7 2502 blink tr1, r63 2503 fmov.dq dr4, r7 2504 blink tr1, r63 2505 fmov.dq dr6, r7 2506 blink tr1, r63 2507LOCAL(ct_r8_fph): /* Copy r8 from a high FP register. */ 2508 /* It is either dr8 or dr10. */ 2509 movi 15 << 8, r31 2510 andi r1, 1 << 8, r32 2511 andc r1, r31, r1 2512 fmov.dq dr8, r8 2513 beq/l r32, r63, tr1 2514 fmov.dq dr10, r8 2515 blink tr1, r63 2516LOCAL(ct_r8_fpl): /* Copy r8 from a low FP register. 
*/ 2517 shlri r1, 8 - 3, r34 2518 andi r34, 3 << 3, r33 2519 addi r33, LOCAL(ct_r8_fp_copy) - datalabel LOCAL(ct_r8_fp_base), r32 2520LOCAL(ct_r8_fp_base): 2521 ptrel/l r32, tr2 2522 movi 7 << 8, r31 2523 andc r1, r31, r1 2524 blink tr2, r63 2525LOCAL(ct_r8_fp_copy): 2526 fmov.dq dr0, r8 2527 blink tr1, r63 2528 fmov.dq dr2, r8 2529 blink tr1, r63 2530 fmov.dq dr4, r8 2531 blink tr1, r63 2532 fmov.dq dr6, r8 2533 blink tr1, r63 2534LOCAL(ct_r9_fph): /* Copy r9 from a high FP register. */ 2535 /* It is either dr8 or dr10. */ 2536 movi 15 << 4, r31 2537 andi r1, 1 << 4, r32 2538 andc r1, r31, r1 2539 fmov.dq dr8, r9 2540 beq/l r32, r63, tr1 2541 fmov.dq dr10, r9 2542 blink tr1, r63 2543LOCAL(ct_r9_fpl): /* Copy r9 from a low FP register. */ 2544 shlri r1, 4 - 3, r34 2545 andi r34, 3 << 3, r33 2546 addi r33, LOCAL(ct_r9_fp_copy) - datalabel LOCAL(ct_r9_fp_base), r32 2547LOCAL(ct_r9_fp_base): 2548 ptrel/l r32, tr2 2549 movi 7 << 4, r31 2550 andc r1, r31, r1 2551 blink tr2, r63 2552LOCAL(ct_r9_fp_copy): 2553 fmov.dq dr0, r9 2554 blink tr1, r63 2555 fmov.dq dr2, r9 2556 blink tr1, r63 2557 fmov.dq dr4, r9 2558 blink tr1, r63 2559 fmov.dq dr6, r9 2560 blink tr1, r63 2561LOCAL(ct_r2_ld): /* Copy r2 from a memory address. */ 2562 pt/l LOCAL(ct_r2_load), tr2 2563 movi 3, r30 2564 shlli r30, 29, r31 2565 and r1, r31, r32 2566 andc r1, r31, r1 2567 beq/l r31, r32, tr2 2568 addi.l r2, 8, r3 2569 ldx.q r2, r63, r2 2570 /* Fall through. */ 2571LOCAL(ct_r3_ld): /* Copy r3 from a memory address. */ 2572 pt/l LOCAL(ct_r3_load), tr2 2573 movi 3, r30 2574 shlli r30, 26, r31 2575 and r1, r31, r32 2576 andc r1, r31, r1 2577 beq/l r31, r32, tr2 2578 addi.l r3, 8, r4 2579 ldx.q r3, r63, r3 2580LOCAL(ct_r4_ld): /* Copy r4 from a memory address. */ 2581 pt/l LOCAL(ct_r4_load), tr2 2582 movi 3, r30 2583 shlli r30, 23, r31 2584 and r1, r31, r32 2585 andc r1, r31, r1 2586 beq/l r31, r32, tr2 2587 addi.l r4, 8, r5 2588 ldx.q r4, r63, r4 2589LOCAL(ct_r5_ld): /* Copy r5 from a memory address. 
*/ 2590 pt/l LOCAL(ct_r5_load), tr2 2591 movi 3, r30 2592 shlli r30, 20, r31 2593 and r1, r31, r32 2594 andc r1, r31, r1 2595 beq/l r31, r32, tr2 2596 addi.l r5, 8, r6 2597 ldx.q r5, r63, r5 2598LOCAL(ct_r6_ld): /* Copy r6 from a memory address. */ 2599 pt/l LOCAL(ct_r6_load), tr2 2600 movi 3 << 16, r31 2601 and r1, r31, r32 2602 andc r1, r31, r1 2603 beq/l r31, r32, tr2 2604 addi.l r6, 8, r7 2605 ldx.q r6, r63, r6 2606LOCAL(ct_r7_ld): /* Copy r7 from a memory address. */ 2607 pt/l LOCAL(ct_r7_load), tr2 2608 movi 3 << 12, r31 2609 and r1, r31, r32 2610 andc r1, r31, r1 2611 beq/l r31, r32, tr2 2612 addi.l r7, 8, r8 2613 ldx.q r7, r63, r7 2614LOCAL(ct_r8_ld): /* Copy r8 from a memory address. */ 2615 pt/l LOCAL(ct_r8_load), tr2 2616 movi 3 << 8, r31 2617 and r1, r31, r32 2618 andc r1, r31, r1 2619 beq/l r31, r32, tr2 2620 addi.l r8, 8, r9 2621 ldx.q r8, r63, r8 2622LOCAL(ct_r9_ld): /* Copy r9 from a memory address. */ 2623 pt/l LOCAL(ct_check_tramp), tr2 2624 ldx.q r9, r63, r9 2625 blink tr2, r63 2626LOCAL(ct_r2_load): 2627 ldx.q r2, r63, r2 2628 blink tr1, r63 2629LOCAL(ct_r3_load): 2630 ldx.q r3, r63, r3 2631 blink tr1, r63 2632LOCAL(ct_r4_load): 2633 ldx.q r4, r63, r4 2634 blink tr1, r63 2635LOCAL(ct_r5_load): 2636 ldx.q r5, r63, r5 2637 blink tr1, r63 2638LOCAL(ct_r6_load): 2639 ldx.q r6, r63, r6 2640 blink tr1, r63 2641LOCAL(ct_r7_load): 2642 ldx.q r7, r63, r7 2643 blink tr1, r63 2644LOCAL(ct_r8_load): 2645 ldx.q r8, r63, r8 2646 blink tr1, r63 2647LOCAL(ct_r2_pop): /* Pop r2 from the stack. */ 2648 movi 1, r30 2649 ldx.q r15, r63, r2 2650 shlli r30, 29, r31 2651 addi.l r15, 8, r15 2652 andc r1, r31, r1 2653 blink tr1, r63 2654LOCAL(ct_r3_pop): /* Pop r3 from the stack. */ 2655 movi 1, r30 2656 ldx.q r15, r63, r3 2657 shlli r30, 26, r31 2658 addi.l r15, 8, r15 2659 andc r1, r31, r1 2660 blink tr1, r63 2661LOCAL(ct_r4_pop): /* Pop r4 from the stack. 
*/ 2662 movi 1, r30 2663 ldx.q r15, r63, r4 2664 shlli r30, 23, r31 2665 addi.l r15, 8, r15 2666 andc r1, r31, r1 2667 blink tr1, r63 2668LOCAL(ct_r5_pop): /* Pop r5 from the stack. */ 2669 movi 1, r30 2670 ldx.q r15, r63, r5 2671 shlli r30, 20, r31 2672 addi.l r15, 8, r15 2673 andc r1, r31, r1 2674 blink tr1, r63 2675LOCAL(ct_r6_pop): /* Pop r6 from the stack. */ 2676 movi 1, r30 2677 ldx.q r15, r63, r6 2678 shlli r30, 16, r31 2679 addi.l r15, 8, r15 2680 andc r1, r31, r1 2681 blink tr1, r63 2682LOCAL(ct_r7_pop): /* Pop r7 from the stack. */ 2683 ldx.q r15, r63, r7 2684 movi 1 << 12, r31 2685 addi.l r15, 8, r15 2686 andc r1, r31, r1 2687 blink tr1, r63 2688LOCAL(ct_r8_pop): /* Pop r8 from the stack. */ 2689 ldx.q r15, r63, r8 2690 movi 1 << 8, r31 2691 addi.l r15, 8, r15 2692 andc r1, r31, r1 2693 blink tr1, r63 2694LOCAL(ct_pop_seq): /* Pop a sequence of registers off the stack. */ 2695 andi r1, 7 << 1, r30 2696 movi (LOCAL(ct_end_of_pop_seq) >> 16) & 65535, r32 2697 shlli r30, 2, r31 2698 shori LOCAL(ct_end_of_pop_seq) & 65535, r32 2699 sub.l r32, r31, r33 2700 ptabs/l r33, tr2 2701 blink tr2, r63 2702LOCAL(ct_start_of_pop_seq): /* Beginning of pop sequence. */ 2703 ldx.q r15, r63, r3 2704 addi.l r15, 8, r15 2705 ldx.q r15, r63, r4 2706 addi.l r15, 8, r15 2707 ldx.q r15, r63, r5 2708 addi.l r15, 8, r15 2709 ldx.q r15, r63, r6 2710 addi.l r15, 8, r15 2711 ldx.q r15, r63, r7 2712 addi.l r15, 8, r15 2713 ldx.q r15, r63, r8 2714 addi.l r15, 8, r15 2715LOCAL(ct_r9_pop): /* Pop r9 from the stack. */ 2716 ldx.q r15, r63, r9 2717 addi.l r15, 8, r15 2718LOCAL(ct_end_of_pop_seq): /* Label used to compute first pop instruction. */ 2719LOCAL(ct_check_tramp): /* Check whether we need a trampoline. */ 2720 pt/u LOCAL(ct_ret_wide), tr2 2721 andi r1, 1, r1 2722 bne/u r1, r63, tr2 2723LOCAL(ct_call_func): /* Just branch to the function. */ 2724 blink tr0, r63 2725LOCAL(ct_ret_wide): /* Call the function, so that we can unpack its 2726 64-bit return value. 
*/
	add.l	r18, r63, r10	/* Save the original return address in r10.  */
	blink	tr0, r18	/* Call the function; r18 becomes its return address.  */
	ptabs	r10, tr0	/* Prepare to return to the saved address.  */
#if __LITTLE_ENDIAN__
	/* Unpack the 64-bit value in r2 into the r2/r3 pair.  */
	shari	r2, 32, r3
	add.l	r2, r63, r2
#else
	add.l	r2, r63, r3
	shari	r2, 32, r2
#endif
	blink	tr0, r63

	ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline))
#endif /* L_shcompact_call_trampoline */

#ifdef L_shcompact_return_trampoline
	/* This function does the converse of the code in `ret_wide'
	   above.  It is tail-called by SHcompact functions returning
	   64-bit non-floating-point values, to pack the 32-bit values in
	   r2 and r3 into r2.  */

	.mode	SHmedia
	.section	.text..SHmedia32, "ax"
	.align	2
	.global	GLOBAL(GCC_shcompact_return_trampoline)
	HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline))
GLOBAL(GCC_shcompact_return_trampoline):
	ptabs/l	r18, tr0	/* Return target.  */
#if __LITTLE_ENDIAN__
	addz.l	r2, r63, r2	/* Zero-extend low word...  */
	shlli	r3, 32, r3	/* ...and shift high word into place.  */
#else
	addz.l	r3, r63, r3
	shlli	r2, 32, r2
#endif
	or	r3, r2, r2	/* Packed 64-bit result in r2.  */
	blink	tr0, r63

	ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline))
#endif /* L_shcompact_return_trampoline */

#ifdef L_shcompact_incoming_args
	.section	.rodata
	.align	1
	/* Branch-offset table indexed by the argument-transfer cookie;
	   each entry is the distance from ia_main_label to the handler
	   for one register's 2-bit field.  Entries of 1 are unreachable
	   cookie encodings and just loop.  */
LOCAL(ia_main_table):
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r2_ld) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r2_push) - datalabel LOCAL(ia_main_label)
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r3_ld) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r3_push) - datalabel LOCAL(ia_main_label)
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r4_ld) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r4_push) - datalabel LOCAL(ia_main_label)
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r5_ld) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r5_push) - datalabel LOCAL(ia_main_label)
.word	1 /* Invalid, just loop */
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r6_ld) - datalabel LOCAL(ia_main_label)
/* Remainder of the ia_main_table dispatch table (r6..r9 handlers).  */
.word	LOCAL(ia_r6_push) - datalabel LOCAL(ia_main_label)
.word	1 /* Invalid, just loop */
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r7_ld) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r7_push) - datalabel LOCAL(ia_main_label)
.word	1 /* Invalid, just loop */
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r8_ld) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r8_push) - datalabel LOCAL(ia_main_label)
.word	1 /* Invalid, just loop */
.word	1 /* Invalid, just loop */
.word	LOCAL(ia_r9_ld) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
.word	LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
	.mode	SHmedia
	.section	.text..SHmedia32, "ax"
	.align	2

	/* This function stores 64-bit general-purpose registers back in
	   the stack, and loads the address in which each register
	   was stored into itself.  The lower 32 bits of r17 hold the address
	   to begin storing, and the upper 32 bits of r17 hold the cookie.
	   Its execution time is linear on the
	   number of registers that actually have to be copied, and it is
	   optimized for structures larger than 64 bits, as opposed to
	   individual `long long' arguments.  See sh.h for details on the
	   actual bit pattern.  */

	.global	GLOBAL(GCC_shcompact_incoming_args)
	FUNC(GLOBAL(GCC_shcompact_incoming_args))
GLOBAL(GCC_shcompact_incoming_args):
	ptabs/l	r18, tr0	/* Prepare to return.  */
	shlri	r17, 32, r0	/* Load the cookie.  */
	movi	((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43
	pt/l	LOCAL(ia_loop), tr1
	add.l	r17, r63, r17	/* Keep only the low 32 bits: store address.  */
	shori	((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43
LOCAL(ia_loop):
	/* Dispatch on the highest set bit of the remaining cookie:
	   nsb finds it, and the .word table selects the handler.  */
	nsb	r0, r36
	shlli	r36, 1, r37
	ldx.w	r43, r37, r38
LOCAL(ia_main_label):
	ptrel/l	r38, tr2
	blink	tr2, r63
LOCAL(ia_r2_ld):	/* Store r2 and load its address.  */
	movi	3, r38
	shlli	r38, 29, r39
	and	r0, r39, r40
	andc	r0, r39, r0	/* Clear this register's 2-bit cookie field.  */
	stx.q	r17, r63, r2
	add.l	r17, r63, r2
	addi.l	r17, 8, r17
	beq/u	r39, r40, tr1	/* Back to the loop if both field bits were set.  */
LOCAL(ia_r3_ld):	/* Store r3 and load its address.  */
	movi	3, r38
	shlli	r38, 26, r39
	and	r0, r39, r40
	andc	r0, r39, r0
	stx.q	r17, r63, r3
	add.l	r17, r63, r3
	addi.l	r17, 8, r17
	beq/u	r39, r40, tr1
LOCAL(ia_r4_ld):	/* Store r4 and load its address.  */
	movi	3, r38
	shlli	r38, 23, r39
	and	r0, r39, r40
	andc	r0, r39, r0
	stx.q	r17, r63, r4
	add.l	r17, r63, r4
	addi.l	r17, 8, r17
	beq/u	r39, r40, tr1
LOCAL(ia_r5_ld):	/* Store r5 and load its address.  */
	movi	3, r38
	shlli	r38, 20, r39
	and	r0, r39, r40
	andc	r0, r39, r0
	stx.q	r17, r63, r5
	add.l	r17, r63, r5
	addi.l	r17, 8, r17
	beq/u	r39, r40, tr1
LOCAL(ia_r6_ld):	/* Store r6 and load its address.  */
	movi	3, r38
	shlli	r38, 16, r39
	and	r0, r39, r40
	andc	r0, r39, r0
	stx.q	r17, r63, r6
	add.l	r17, r63, r6
	addi.l	r17, 8, r17
	beq/u	r39, r40, tr1
LOCAL(ia_r7_ld):	/* Store r7 and load its address.  */
	movi	3 << 12, r39
	and	r0, r39, r40
	andc	r0, r39, r0
	stx.q	r17, r63, r7
	add.l	r17, r63, r7
	addi.l	r17, 8, r17
	beq/u	r39, r40, tr1
LOCAL(ia_r8_ld):	/* Store r8 and load its address.  */
	movi	3 << 8, r39
	and	r0, r39, r40
	andc	r0, r39, r0
	stx.q	r17, r63, r8
	add.l	r17, r63, r8
	addi.l	r17, 8, r17
	beq/u	r39, r40, tr1
LOCAL(ia_r9_ld):	/* Store r9 and load its address.  */
	stx.q	r17, r63, r9
	add.l	r17, r63, r9
	blink	tr0, r63	/* r9 is the last register: return.  */
LOCAL(ia_r2_push):	/* Push r2 onto the stack.  */
	movi	1, r38
	shlli	r38, 29, r39
	andc	r0, r39, r0
	stx.q	r17, r63, r2
	addi.l	r17, 8, r17
	blink	tr1, r63
LOCAL(ia_r3_push):	/* Push r3 onto the stack.  */
	movi	1, r38
	shlli	r38, 26, r39
	andc	r0, r39, r0
	stx.q	r17, r63, r3
	addi.l	r17, 8, r17
	blink	tr1, r63
LOCAL(ia_r4_push):	/* Push r4 onto the stack.  */
	movi	1, r38
	shlli	r38, 23, r39
	andc	r0, r39, r0
	stx.q	r17, r63, r4
	addi.l	r17, 8, r17
	blink	tr1, r63
LOCAL(ia_r5_push):	/* Push r5 onto the stack.  */
	movi	1, r38
	shlli	r38, 20, r39
	andc	r0, r39, r0
	stx.q	r17, r63, r5
	addi.l	r17, 8, r17
	blink	tr1, r63
LOCAL(ia_r6_push):	/* Push r6 onto the stack.  */
	movi	1, r38
	shlli	r38, 16, r39
	andc	r0, r39, r0
	stx.q	r17, r63, r6
	addi.l	r17, 8, r17
	blink	tr1, r63
LOCAL(ia_r7_push):	/* Push r7 onto the stack.  */
	movi	1 << 12, r39
	andc	r0, r39, r0
	stx.q	r17, r63, r7
	addi.l	r17, 8, r17
	blink	tr1, r63
LOCAL(ia_r8_push):	/* Push r8 onto the stack.  */
	movi	1 << 8, r39
	andc	r0, r39, r0
	stx.q	r17, r63, r8
	addi.l	r17, 8, r17
	blink	tr1, r63
LOCAL(ia_push_seq):	/* Push a sequence of registers onto the stack.  */
	/* Compute the entry point inside the push sequence below from the
	   3-bit count field in the cookie (each push is 2 insns = 8 bytes,
	   counted back from ia_end_of_push_seq).  */
	andi	r0, 7 << 1, r38
	movi	(LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40
	shlli	r38, 2, r39
	shori	LOCAL(ia_end_of_push_seq) & 65535, r40
	sub.l	r40, r39, r41
	ptabs/l	r41, tr2
	blink	tr2, r63
LOCAL(ia_stack_of_push_seq): /* Beginning of push sequence.  */
	/* NOTE(review): this label appears unreferenced in this file; the
	   computed branch above only uses ia_end_of_push_seq.  */
	stx.q	r17, r63, r3
	addi.l	r17, 8, r17
	stx.q	r17, r63, r4
	addi.l	r17, 8, r17
	stx.q	r17, r63, r5
	addi.l	r17, 8, r17
	stx.q	r17, r63, r6
	addi.l	r17, 8, r17
	stx.q	r17, r63, r7
	addi.l	r17, 8, r17
	stx.q	r17, r63, r8
	addi.l	r17, 8, r17
LOCAL(ia_r9_push):	/* Push r9 onto the stack.  */
	stx.q	r17, r63, r9
LOCAL(ia_return):	/* Return.  */
	blink	tr0, r63
LOCAL(ia_end_of_push_seq): /* Label used to compute first push instruction.  */
	ENDFUNC(GLOBAL(GCC_shcompact_incoming_args))
#endif /* L_shcompact_incoming_args */
#endif
#if __SH5__
#ifdef L_nested_trampoline
#if __SH5__ == 32
	.section	.text..SHmedia32,"ax"
#else
	.text
#endif
	.align	3 /* It is copied in units of 8 bytes in SHmedia mode.  */
	.global	GLOBAL(GCC_nested_trampoline)
	HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline))
GLOBAL(GCC_nested_trampoline):
	.mode	SHmedia
	/* Template trampoline: loads the chain value and target address
	   from fixed offsets past its own start address.  */
	ptrel/u	r63, tr0
	gettr	tr0, r0		/* r0 = address of this trampoline copy.  */
#if __SH5__ == 64
	ld.q	r0, 24, r1	/* Target function address.  */
#else
	ld.l	r0, 24, r1
#endif
	ptabs/l	r1, tr1
#if __SH5__ == 64
	ld.q	r0, 32, r1	/* Static chain value.  */
#else
	ld.l	r0, 28, r1
#endif
	blink	tr1, r63

	ENDFUNC(GLOBAL(GCC_nested_trampoline))
#endif /* L_nested_trampoline */
#endif /* __SH5__ */
#if __SH5__ == 32
#ifdef L_push_pop_shmedia_regs
	.section	.text..SHmedia32,"ax"
	.mode	SHmedia
	.align	2
#ifndef __SH4_NOFPU__
	/* Save the SHmedia registers not visible to SHcompact code
	   (dr36..dr62 here; general registers and targets below).  */
	.global	GLOBAL(GCC_push_shmedia_regs)
	FUNC(GLOBAL(GCC_push_shmedia_regs))
GLOBAL(GCC_push_shmedia_regs):
	addi.l	r15, -14*8, r15
	fst.d	r15, 13*8, dr62
	fst.d	r15, 12*8, dr60
	fst.d	r15, 11*8, dr58
	fst.d	r15, 10*8, dr56
	fst.d	r15, 9*8, dr54
	fst.d	r15, 8*8, dr52
	fst.d	r15, 7*8, dr50
	fst.d	r15, 6*8, dr48
	fst.d	r15, 5*8, dr46
	fst.d	r15, 4*8, dr44
	fst.d	r15, 3*8, dr42
	fst.d	r15, 2*8, dr40
	fst.d	r15, 1*8, dr38
	fst.d	r15, 0*8, dr36
#else /* ! __SH4_NOFPU__ */
	.global	GLOBAL(GCC_push_shmedia_regs_nofpu)
	FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
GLOBAL(GCC_push_shmedia_regs_nofpu):
#endif /* ! __SH4_NOFPU__ */
	/* Save r28..r35, r44..r62 and tr5..tr7 (27 quadwords).  */
	ptabs/l	r18, tr0
	addi.l	r15, -27*8, r15
	gettr	tr7, r62
	gettr	tr6, r61
	gettr	tr5, r60
	st.q	r15, 26*8, r62
	st.q	r15, 25*8, r61
	st.q	r15, 24*8, r60
	st.q	r15, 23*8, r59
	st.q	r15, 22*8, r58
	st.q	r15, 21*8, r57
	st.q	r15, 20*8, r56
	st.q	r15, 19*8, r55
	st.q	r15, 18*8, r54
	st.q	r15, 17*8, r53
	st.q	r15, 16*8, r52
	st.q	r15, 15*8, r51
	st.q	r15, 14*8, r50
	st.q	r15, 13*8, r49
	st.q	r15, 12*8, r48
	st.q	r15, 11*8, r47
	st.q	r15, 10*8, r46
	st.q	r15, 9*8, r45
	st.q	r15, 8*8, r44
	st.q	r15, 7*8, r35
	st.q	r15, 6*8, r34
	st.q	r15, 5*8, r33
	st.q	r15, 4*8, r32
	st.q	r15, 3*8, r31
	st.q	r15, 2*8, r30
	st.q	r15, 1*8, r29
	st.q	r15, 0*8, r28
	blink	tr0, r63
#ifndef __SH4_NOFPU__
	ENDFUNC(GLOBAL(GCC_push_shmedia_regs))
#else
	ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
#endif
#ifndef __SH4_NOFPU__
	/* Restore what GCC_push_shmedia_regs saved; the FPU variant first
	   reloads dr36..dr62, then falls into the shared tail at .L0 with
	   r0 holding the total frame size to pop.  */
	.global	GLOBAL(GCC_pop_shmedia_regs)
	FUNC(GLOBAL(GCC_pop_shmedia_regs))
GLOBAL(GCC_pop_shmedia_regs):
	pt	.L0, tr1
	movi	41*8, r0
	fld.d	r15, 40*8, dr62
	fld.d	r15, 39*8, dr60
	fld.d	r15, 38*8, dr58
	fld.d	r15, 37*8, dr56
	fld.d	r15, 36*8, dr54
	fld.d	r15, 35*8, dr52
	fld.d	r15, 34*8, dr50
	fld.d	r15, 33*8, dr48
	fld.d	r15, 32*8, dr46
	fld.d	r15, 31*8, dr44
	fld.d	r15, 30*8, dr42
	fld.d	r15, 29*8, dr40
	fld.d	r15, 28*8, dr38
	fld.d	r15, 27*8, dr36
	blink	tr1, r63
#else /* ! __SH4_NOFPU__ */
	.global	GLOBAL(GCC_pop_shmedia_regs_nofpu)
	FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
GLOBAL(GCC_pop_shmedia_regs_nofpu):
#endif /* ! __SH4_NOFPU__ */
	movi	27*8, r0
.L0:
	ptabs	r18, tr0
	ld.q	r15, 26*8, r62
	ld.q	r15, 25*8, r61
	ld.q	r15, 24*8, r60
	ptabs	r62, tr7
	ptabs	r61, tr6
	ptabs	r60, tr5
	ld.q	r15, 23*8, r59
	ld.q	r15, 22*8, r58
	ld.q	r15, 21*8, r57
	ld.q	r15, 20*8, r56
	ld.q	r15, 19*8, r55
	ld.q	r15, 18*8, r54
	ld.q	r15, 17*8, r53
	ld.q	r15, 16*8, r52
	ld.q	r15, 15*8, r51
	ld.q	r15, 14*8, r50
	ld.q	r15, 13*8, r49
	ld.q	r15, 12*8, r48
	ld.q	r15, 11*8, r47
	ld.q	r15, 10*8, r46
	ld.q	r15, 9*8, r45
	ld.q	r15, 8*8, r44
	ld.q	r15, 7*8, r35
	ld.q	r15, 6*8, r34
	ld.q	r15, 5*8, r33
	ld.q	r15, 4*8, r32
	ld.q	r15, 3*8, r31
	ld.q	r15, 2*8, r30
	ld.q	r15, 1*8, r29
	ld.q	r15, 0*8, r28
	add.l	r15, r0, r15	/* Pop the whole frame (size in r0).  */
	blink	tr0, r63

#ifndef __SH4_NOFPU__
	ENDFUNC(GLOBAL(GCC_pop_shmedia_regs))
#else
	ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
#endif
#endif /* L_push_pop_shmedia_regs */
#endif /* __SH5__ == 32 */

#ifdef L_div_table
#if __SH5__
#if defined(__pic__) && __SHMEDIA__
	.global	GLOBAL(sdivsi3)
	FUNC(GLOBAL(sdivsi3))
#if __SH5__ == 32
	.section	.text..SHmedia32,"ax"
#else
	.text
#endif
#if 0
/* ??? FIXME: Presumably due to a linker bug, exporting data symbols
   in a text section does not work (at least for shared libraries):
   the linker sets the LSB of the address as if this was SHmedia code.
 */
#define TEXT_DATA_BUG
#endif
	.align	2
	/* SHmedia signed 32-bit division via a table-driven
	   Newton-Raphson reciprocal approximation.  */
	// inputs: r4,r5
	// clobbered: r1,r18,r19,r20,r21,r25,tr0
	// result in r0
	.global	GLOBAL(sdivsi3)
GLOBAL(sdivsi3):
#ifdef TEXT_DATA_BUG
	ptb	datalabel Local_div_table,tr0
#else
	ptb	GLOBAL(div_table_internal),tr0
#endif
	nsb	r5, r1
	shlld	r5, r1, r25	// normalize; [-2 ..1, 1..2) in s2.62
	shari	r25, 58, r21	// extract 5(6) bit index (s2.4 with hole -1..1)
	/* bubble */
	gettr	tr0,r20
	ldx.ub	r20, r21, r19	// u0.8
	shari	r25, 32, r25	// normalize to s2.30
	shlli	r21, 1, r21
	muls.l	r25, r19, r19	// s2.38
	ldx.w	r20, r21, r21	// s2.14
	ptabs	r18, tr0
	shari	r19, 24, r19	// truncate to s2.14
	sub	r21, r19, r19	// some 11 bit inverse in s1.14
	muls.l	r19, r19, r21	// u0.28
	sub	r63, r1, r1
	addi	r1, 92, r1
	muls.l	r25, r21, r18	// s2.58
	shlli	r19, 45, r19	// multiply by two and convert to s2.58
	/* bubble */
	sub	r19, r18, r18
	shari	r18, 28, r18	// some 22 bit inverse in s1.30
	muls.l	r18, r25, r0	// s2.60
	muls.l	r18, r4, r25	// s32.30
	/* bubble */
	shari	r0, 16, r19	// s-16.44
	muls.l	r19, r18, r19	// s-16.74
	shari	r25, 63, r0
	shari	r4, 14, r18	// s19.-14
	shari	r19, 30, r19	// s-16.44
	muls.l	r19, r18, r19	// s15.30
	xor	r21, r0, r21	// You could also use the constant 1 << 27.
	add	r21, r25, r21
	sub	r21, r19, r21
	shard	r21, r1, r21
	sub	r21, r0, r0
	blink	tr0, r63
	ENDFUNC(GLOBAL(sdivsi3))
/* This table has been generated by divtab.c .
Defects for bias -330:
   Max defect: 6.081536e-07 at -1.000000e+00
   Min defect: 2.849516e-08 at 1.030651e+00
   Max 2nd step defect: 9.606539e-12 at -1.000000e+00
   Min 2nd step defect: 0.000000e+00 at 0.000000e+00
   Defect at 1: 1.238659e-07
   Defect at -2: 1.061708e-07 */
#else /* ! __pic__ || !
 __SHMEDIA__ */
	.section	.rodata
#endif /* __pic__ */
#if defined(TEXT_DATA_BUG) && defined(__pic__) && __SHMEDIA__
	/* Text-section copy of the division table (see FIXME above);
	   the layout is identical to GLOBAL(div_table) below.  */
	.balign	2
	.type	Local_div_table,@object
	.size	Local_div_table,128
/* negative division constants */
	.word	-16638
	.word	-17135
	.word	-17737
	.word	-18433
	.word	-19103
	.word	-19751
	.word	-20583
	.word	-21383
	.word	-22343
	.word	-23353
	.word	-24407
	.word	-25582
	.word	-26863
	.word	-28382
	.word	-29965
	.word	-31800
/* negative division factors */
	.byte	66
	.byte	70
	.byte	75
	.byte	81
	.byte	87
	.byte	93
	.byte	101
	.byte	109
	.byte	119
	.byte	130
	.byte	142
	.byte	156
	.byte	172
	.byte	192
	.byte	214
	.byte	241
	.skip	16
Local_div_table:
	.skip	16
/* positive division factors */
	.byte	241
	.byte	214
	.byte	192
	.byte	172
	.byte	156
	.byte	142
	.byte	130
	.byte	119
	.byte	109
	.byte	101
	.byte	93
	.byte	87
	.byte	81
	.byte	75
	.byte	70
	.byte	66
/* positive division constants */
	.word	31801
	.word	29966
	.word	28383
	.word	26864
	.word	25583
	.word	24408
	.word	23354
	.word	22344
	.word	21384
	.word	20584
	.word	19752
	.word	19104
	.word	18434
	.word	17738
	.word	17136
	.word	16639
	.section	.rodata
#endif /* TEXT_DATA_BUG */
	/* Division table used by sdivsi3: the label sits in the middle so
	   the signed table index can address both halves.  */
	.balign	2
	.type	GLOBAL(div_table),@object
	.size	GLOBAL(div_table),128
/* negative division constants */
	.word	-16638
	.word	-17135
	.word	-17737
	.word	-18433
	.word	-19103
	.word	-19751
	.word	-20583
	.word	-21383
	.word	-22343
	.word	-23353
	.word	-24407
	.word	-25582
	.word	-26863
	.word	-28382
	.word	-29965
	.word	-31800
/* negative division factors */
	.byte	66
	.byte	70
	.byte	75
	.byte	81
	.byte	87
	.byte	93
	.byte	101
	.byte	109
	.byte	119
	.byte	130
	.byte	142
	.byte	156
	.byte	172
	.byte	192
	.byte	214
	.byte	241
	.skip	16
	.global	GLOBAL(div_table)
GLOBAL(div_table):
	HIDDEN_ALIAS(div_table_internal,div_table)
	.skip	16
/* positive division factors */
	.byte	241
	.byte	214
	.byte	192
	.byte	172
	.byte	156
	.byte	142
	.byte	130
	.byte	119
	.byte	109
	.byte	101
	.byte	93
	.byte	87
	.byte	81
	.byte	75
	.byte	70
	.byte	66
/* positive division constants */
	.word	31801
	.word	29966
	.word	28383
	.word	26864
	.word	25583
	.word	24408
	.word	23354
	.word	22344
	.word	21384
	.word	20584
	.word	19752
	.word	19104
	.word	18434
	.word	17738
	.word	17136
	.word	16639

#elif defined (__SH2A__) || defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
/* This code uses shld, thus is not suitable for SH1 / SH2.  */

/* Signed / unsigned division without use of FPU, optimized for SH4.
   Uses a lookup table for divisors in the range -128 .. +128, and
   div1 with case distinction for larger divisors in three more ranges.
   The code is lumped together with the table to allow the use of mova.
 */
/* Byte offsets within a 32-bit word on the stack, endian-dependent.  */
#ifdef __LITTLE_ENDIAN__
#define L_LSB 0
#define L_LSWMSB 1
#define L_MSWLSB 2
#else
#define L_LSB 3
#define L_LSWMSB 2
#define L_MSWLSB 1
#endif

	/* Unsigned 32 / 32 division, r4 / r5, result in r0.  */
	.balign	4
	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
	mov.w	LOCAL(c128_w), r1
	div0u
	mov	r4,r0
	shlr8	r0
	cmp/hi	r1,r5		! divisor > 128?
	extu.w	r5,r1
	bf	LOCAL(udiv_le128)
	cmp/eq	r5,r1		! divisor fits in 16 bits?
	bf	LOCAL(udiv_ge64k)
	shlr	r0
	mov	r5,r1
	shll16	r5
	mov.l	r4,@-r15
	div1	r5,r0
	mov.l	r1,@-r15
	div1	r5,r0
	div1	r5,r0
	bra	LOCAL(udiv_25)
	div1	r5,r0

LOCAL(div_le128):
	mova	LOCAL(div_table_ix),r0
	bra	LOCAL(div_le128_2)
	mov.b	@(r0,r5),r1
LOCAL(udiv_le128):
	mov.l	r4,@-r15
	mova	LOCAL(div_table_ix),r0
	mov.b	@(r0,r5),r1
	mov.l	r5,@-r15
LOCAL(div_le128_2):
	/* Small divisor: multiply by the normalized inverse from the
	   table, then shift by the table's clz entry.  */
	mova	LOCAL(div_table_inv),r0
	mov.l	@(r0,r1),r1
	mov	r5,r0
	tst	#0xfe,r0	! divisor 0 or 1?
	mova	LOCAL(div_table_clz),r0
	dmulu.l	r1,r4
	mov.b	@(r0,r5),r1
	bt/s	LOCAL(div_by_1)
	mov	r4,r0
	mov.l	@r15+,r5
	sts	mach,r0
	/* clrt */
	addc	r4,r0
	mov.l	@r15+,r4
	rotcr	r0
	rts
	shld	r1,r0

LOCAL(div_by_1_neg):
	neg	r4,r0
LOCAL(div_by_1):
	mov.l	@r15+,r5
	rts
	mov.l	@r15+,r4

LOCAL(div_ge64k):
	bt/s	LOCAL(div_r8)
	div0u
	shll8	r5
	bra	LOCAL(div_ge64k_2)
	div1	r5,r0
LOCAL(udiv_ge64k):
	cmp/hi	r0,r5
	mov	r5,r1
	bt	LOCAL(udiv_r8)
	shll8	r5
	mov.l	r4,@-r15
	div1	r5,r0
	mov.l	r1,@-r15
LOCAL(div_ge64k_2):
	div1	r5,r0
	mov.l	LOCAL(zero_l),r1
	.rept	4
	div1	r5,r0
	.endr
	mov.l	r1,@-r15
	div1	r5,r0
	mov.w	LOCAL(m256_w),r1
	div1	r5,r0
	mov.b	r0,@(L_LSWMSB,r15)
	xor	r4,r0
	and	r1,r0
	bra	LOCAL(div_ge64k_end)
	xor	r4,r0

LOCAL(div_r8):
	shll16	r4
	bra	LOCAL(div_r8_2)
	shll8	r4
LOCAL(udiv_r8):
	/* Divisor uses more than 24 bits: plain restoring division.  */
	mov.l	r4,@-r15
	shll16	r4
	clrt
	shll8	r4
	mov.l	r5,@-r15
LOCAL(div_r8_2):
	rotcl	r4
	mov	r0,r1
	div1	r5,r1
	mov	r4,r0
	rotcl	r0
	mov	r5,r4
	div1	r5,r1
	.rept	5
	rotcl	r0; div1 r5,r1
	.endr
	rotcl	r0
	mov.l	@r15+,r5
	div1	r4,r1
	mov.l	@r15+,r4
	rts
	rotcl	r0

	ENDFUNC(GLOBAL(udivsi3_i4i))

	/* Signed 32 / 32 division, r4 / r5, result in r0.  */
	.global	GLOBAL(sdivsi3_i4i)
	FUNC(GLOBAL(sdivsi3_i4i))
	/* This is link-compatible with a GLOBAL(sdivsi3) call,
	   but we effectively clobber only r1.  */
GLOBAL(sdivsi3_i4i):
	mov.l	r4,@-r15
	cmp/pz	r5
	mov.w	LOCAL(c128_w), r1
	bt/s	LOCAL(pos_divisor)
	cmp/pz	r4
	mov.l	r5,@-r15
	neg	r5,r5		! make divisor positive
	bt/s	LOCAL(neg_result)
	cmp/hi	r1,r5
	neg	r4,r4
LOCAL(pos_result):
	extu.w	r5,r0
	bf	LOCAL(div_le128)
	cmp/eq	r5,r0
	mov	r4,r0
	shlr8	r0
	bf/s	LOCAL(div_ge64k)
	cmp/hi	r0,r5
	div0u
	shll16	r5
	div1	r5,r0
	div1	r5,r0
	div1	r5,r0
LOCAL(udiv_25):
	mov.l	LOCAL(zero_l),r1
	div1	r5,r0
	div1	r5,r0
	mov.l	r1,@-r15
	.rept	3
	div1	r5,r0
	.endr
	mov.b	r0,@(L_MSWLSB,r15)
	xtrct	r4,r0
	swap.w	r0,r0
	.rept	8
	div1	r5,r0
	.endr
	mov.b	r0,@(L_LSWMSB,r15)
LOCAL(div_ge64k_end):
	.rept	8
	div1	r5,r0
	.endr
	mov.l	@r15+,r4 ! zero-extension and swap using LS unit.
/* Continuation of sdivsi3_i4i: result assembly, negative-operand paths,
   and the generated lookup tables shared with udivsi3_i4i.  */
	extu.b	r0,r0
	mov.l	@r15+,r5
	or	r4,r0
	mov.l	@r15+,r4
	rts
	rotcl	r0

LOCAL(div_le128_neg):
	tst	#0xfe,r0
	mova	LOCAL(div_table_ix),r0
	mov.b	@(r0,r5),r1
	mova	LOCAL(div_table_inv),r0
	bt/s	LOCAL(div_by_1_neg)
	mov.l	@(r0,r1),r1
	mova	LOCAL(div_table_clz),r0
	dmulu.l	r1,r4
	mov.b	@(r0,r5),r1
	mov.l	@r15+,r5
	sts	mach,r0
	/* clrt */
	addc	r4,r0
	mov.l	@r15+,r4
	rotcr	r0
	shld	r1,r0
	rts
	neg	r0,r0		! negate quotient for negative result

LOCAL(pos_divisor):
	mov.l	r5,@-r15
	bt/s	LOCAL(pos_result)
	cmp/hi	r1,r5
	neg	r4,r4		! dividend negative: divide -r4, negate at end
LOCAL(neg_result):
	extu.w	r5,r0
	bf	LOCAL(div_le128_neg)
	cmp/eq	r5,r0
	mov	r4,r0
	shlr8	r0
	bf/s	LOCAL(div_ge64k_neg)
	cmp/hi	r0,r5
	div0u
	mov.l	LOCAL(zero_l),r1
	shll16	r5
	div1	r5,r0
	mov.l	r1,@-r15
	.rept	7
	div1	r5,r0
	.endr
	mov.b	r0,@(L_MSWLSB,r15)
	xtrct	r4,r0
	swap.w	r0,r0
	.rept	8
	div1	r5,r0
	.endr
	mov.b	r0,@(L_LSWMSB,r15)
LOCAL(div_ge64k_neg_end):
	.rept	8
	div1	r5,r0
	.endr
	mov.l	@r15+,r4 ! zero-extension and swap using LS unit.
	extu.b	r0,r1
	mov.l	@r15+,r5
	or	r4,r1
LOCAL(div_r8_neg_end):
	mov.l	@r15+,r4
	rotcl	r1
	rts
	neg	r1,r0

LOCAL(div_ge64k_neg):
	bt/s	LOCAL(div_r8_neg)
	div0u
	shll8	r5
	mov.l	LOCAL(zero_l),r1
	.rept	6
	div1	r5,r0
	.endr
	mov.l	r1,@-r15
	div1	r5,r0
	mov.w	LOCAL(m256_w),r1
	div1	r5,r0
	mov.b	r0,@(L_LSWMSB,r15)
	xor	r4,r0
	and	r1,r0
	bra	LOCAL(div_ge64k_neg_end)
	xor	r4,r0

LOCAL(c128_w):
	.word	128

LOCAL(div_r8_neg):
	clrt
	shll16	r4
	mov	r4,r1
	shll8	r1
	mov	r5,r4
	.rept	7
	rotcl	r1; div1 r5,r0
	.endr
	mov.l	@r15+,r5
	rotcl	r1
	bra	LOCAL(div_r8_neg_end)
	div1	r4,r0

LOCAL(m256_w):
	.word	0xff00
/* This table has been generated by divtab-sh4.c.  */
	/* Shift counts indexed by divisor (runs of -1..-6 by magnitude).  */
	.balign	4
LOCAL(div_table_clz):
	.byte	0
	.byte	1
	.byte	0
	.byte	-1
	.byte	-1
	.byte	-2
	.byte	-2
	.byte	-2
	.byte	-2
	.byte	-3
	.byte	-3
	.byte	-3
	.byte	-3
	.byte	-3
	.byte	-3
	.byte	-3
	.byte	-3
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-4
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-5
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
	.byte	-6
/* Lookup table translating positive divisor to index into table of
   normalized inverse.  N.B. the '0' entry is also the last entry of the
   previous table, and causes an unaligned access for division by zero.  */
LOCAL(div_table_ix):
	.byte	-6
	.byte	-128
	.byte	-128
	.byte	0
	.byte	-128
	.byte	-64
	.byte	0
	.byte	64
	.byte	-128
	.byte	-96
	.byte	-64
	.byte	-32
	.byte	0
	.byte	32
	.byte	64
	.byte	96
	.byte	-128
	.byte	-112
	.byte	-96
	.byte	-80
	.byte	-64
	.byte	-48
	.byte	-32
	.byte	-16
	.byte	0
	.byte	16
	.byte	32
	.byte	48
	.byte	64
	.byte	80
	.byte	96
	.byte	112
	.byte	-128
	.byte	-120
	.byte	-112
	.byte	-104
	.byte	-96
	.byte	-88
	.byte	-80
	.byte	-72
	.byte	-64
	.byte	-56
	.byte	-48
	.byte	-40
	.byte	-32
	.byte	-24
	.byte	-16
	.byte	-8
	.byte	0
	.byte	8
	.byte	16
	.byte	24
	.byte	32
	.byte	40
	.byte	48
	.byte	56
	.byte	64
	.byte	72
	.byte	80
	.byte	88
	.byte	96
	.byte	104
	.byte	112
	.byte	120
	.byte	-128
	.byte	-124
	.byte	-120
	.byte	-116
	.byte	-112
	.byte	-108
	.byte	-104
	.byte	-100
	.byte	-96
	.byte	-92
	.byte	-88
	.byte	-84
	.byte	-80
	.byte	-76
	.byte	-72
	.byte	-68
	.byte	-64
	.byte	-60
	.byte	-56
	.byte	-52
	.byte	-48
	.byte	-44
	.byte	-40
	.byte	-36
	.byte	-32
	.byte	-28
	.byte	-24
	.byte	-20
	.byte	-16
	.byte	-12
	.byte	-8
	.byte	-4
	.byte	0
	.byte	4
	.byte	8
	.byte	12
	.byte	16
	.byte	20
	.byte	24
	.byte	28
	.byte	32
	.byte	36
	.byte	40
	.byte	44
	.byte	48
	.byte	52
	.byte	56
	.byte	60
	.byte	64
	.byte	68
	.byte	72
	.byte	76
	.byte	80
	.byte	84
	.byte	88
	.byte	92
	.byte	96
	.byte	100
	.byte	104
	.byte	108
	.byte	112
	.byte	116
	.byte	120
	.byte	124
	.byte	-128
/* 1/64 .. 1/127, normalized.  There is an implicit leading 1 in bit 32.  */
	.balign	4
LOCAL(zero_l):
	.long	0x0
	.long	0xF81F81F9
	.long	0xF07C1F08
	.long	0xE9131AC0
	.long	0xE1E1E1E2
	.long	0xDAE6076C
	.long	0xD41D41D5
	.long	0xCD856891
	.long	0xC71C71C8
	.long	0xC0E07039
	.long	0xBACF914D
	.long	0xB4E81B4F
	.long	0xAF286BCB
	.long	0xA98EF607
	.long	0xA41A41A5
	.long	0x9EC8E952
	.long	0x9999999A
	.long	0x948B0FCE
	.long	0x8F9C18FA
	.long	0x8ACB90F7
	.long	0x86186187
	.long	0x81818182
	.long	0x7D05F418
	.long	0x78A4C818
	.long	0x745D1746
	.long	0x702E05C1
	.long	0x6C16C16D
	.long	0x68168169
	.long	0x642C8591
	.long	0x60581606
	.long	0x5C9882BA
	.long	0x58ED2309
LOCAL(div_table_inv):
	.long	0x55555556
	.long	0x51D07EAF
	.long	0x4E5E0A73
	.long	0x4AFD6A06
	.long	0x47AE147B
	.long	0x446F8657
	.long	0x41414142
	.long	0x3E22CBCF
	.long	0x3B13B13C
	.long	0x38138139
	.long	0x3521CFB3
	.long	0x323E34A3
	.long	0x2F684BDB
	.long	0x2C9FB4D9
	.long	0x29E4129F
	.long	0x27350B89
	.long	0x24924925
	.long	0x21FB7813
	.long	0x1F7047DD
	.long	0x1CF06ADB
	.long	0x1A7B9612
	.long	0x18118119
	.long	0x15B1E5F8
	.long	0x135C8114
	.long	0x11111112
	.long	0xECF56BF
	.long	0xC9714FC
	.long	0xA6810A7
	.long	0x8421085
	.long	0x624DD30
	.long	0x4104105
	.long	0x2040811
	/* maximum error: 0.987342 scaled: 0.921875*/

	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* SH3 / SH4 */

#endif /* L_div_table */

#ifdef L_udiv_qrnnd_16
#if !__SHMEDIA__
	/* 32/16 -> 16 bit division step used by longlong.h's udiv_qrnnd.  */
	HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16))
	/* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */
	/* n1 < d, but n1 might be larger than d1.  */
	.global	GLOBAL(udiv_qrnnd_16)
	.balign	8
GLOBAL(udiv_qrnnd_16):
	div0u
	cmp/hi	r6,r0
	bt	.Lots
	.rept	16
	div1	r6,r0
	.endr
	extu.w	r0,r1
	bt	0f
	add	r6,r0
0:	rotcl	r1
	mulu.w	r1,r5
	xtrct	r4,r0
	swap.w	r0,r0
	sts	macl,r2
	cmp/hs	r2,r0
	sub	r2,r0
	bt	0f
	addc	r5,r0
	add	#-1,r1
	bt	0f
1:	add	#-1,r1
	rts
	add	r5,r0
	.balign	8
.Lots:
	/* n1 exceeds d1: pre-reduce so the quotient fits.  */
	sub	r5,r0
	swap.w	r4,r1
	xtrct	r0,r1
	clrt
	mov	r1,r0
	addc	r5,r0
	mov	#-1,r1
	SL1(bf, 1b,
	 shlr16	r1)
0:	rts
	nop
	ENDFUNC(GLOBAL(udiv_qrnnd_16))
#endif /* !__SHMEDIA__ */
#endif /* L_udiv_qrnnd_16 */