/* -*- Mode: Asm -*- */
;; Copyright (C) 2012-2013 Free Software Foundation, Inc.
;; Contributed by Sean D'Epagnier  (sean@depagnier.com)
;;                Georg-Johann Lay (avr@gjlay.de)

;; This file is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by the
;; Free Software Foundation; either version 3, or (at your option) any
;; later version.

;; In addition to the permissions in the GNU General Public License, the
;; Free Software Foundation gives you unlimited permission to link the
;; compiled version of this file into combinations with other programs,
;; and to distribute those combinations without any restriction coming
;; from the use of this file.  (The General Public License restrictions
;; do apply in other respects; for example, they cover modification of
;; the file, and distribution when not linked into a combine
;; executable.)

;; This file is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fixed point library routines for AVR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.section .text.libgcc.fixed, "ax", @progbits

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions to float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractqqsf)
DEFUN __fractqqsf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Sign-extend
    lsl     r24
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fractqqsf
#endif  /* L_fractqqsf */

#if defined (L_fractuqqsf)
DEFUN __fractuqqsf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuqqsf
#endif  /* L_fractuqqsf */

#if defined (L_fracthqsf)
DEFUN __fracthqsf
    ;; Move in place for SA -> SF conversion
    wmov    22, 24
    ;; Sign-extend
    lsl     r25
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fracthqsf
#endif  /* L_fracthqsf */

#if defined (L_fractuhqsf)
DEFUN __fractuhqsf
    ;; Move in place for USA -> SF conversion
    wmov    22, 24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhqsf
#endif  /* L_fractuhqsf */

#if defined (L_fracthasf)
DEFUN __fracthasf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Sign-extend
    lsl     r25
    sbc     r25, r25
    XJMP    __fractsasf
ENDF __fracthasf
#endif  /* L_fracthasf */

#if defined (L_fractuhasf)
DEFUN __fractuhasf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Zero-extend
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhasf
#endif  /* L_fractuhasf */


#if defined (L_fractsqsf)
DEFUN __fractsqsf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^31 to move the
    ;; decimal point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (31)
    sbci    r25, exp_hi (31)
0:  ret
ENDF __fractsqsf
#endif  /* L_fractsqsf */
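
;; The conversions above scale the float result by subtracting from its
;; biased exponent (the exp_lo/exp_hi constants address the exponent
;; field held in r25:r24).  A C model of that trick (illustrative sketch
;; only, not part of this library; the function name is made up, and
;; exponent underflow is ignored):
;;
;;      #include <stdint.h>
;;      #include <string.h>
;;
;;      static float scale_by_pow2m (float f, int n)  /* f normal, != 0 */
;;      {
;;          uint32_t u;
;;          memcpy (&u, &f, sizeof (u));
;;          u -= (uint32_t) n << 23;     /* exponent field is bits 23..30 */
;;          memcpy (&f, &u, sizeof (f));
;;          return f;                    /* == f * 2^-n */
;;      }
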
#if defined (L_fractusqsf)
DEFUN __fractusqsf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^32 to move the
    ;; decimal point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (32)
    ret
ENDF __fractusqsf
#endif  /* L_fractusqsf */

#if defined (L_fractsasf)
DEFUN __fractsasf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^15 to move the
    ;; decimal point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (15)
    sbci    r25, exp_hi (15)
0:  ret
ENDF __fractsasf
#endif  /* L_fractsasf */

#if defined (L_fractusasf)
DEFUN __fractusasf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^16 to move the
    ;; decimal point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (16)
    ret
ENDF __fractusasf
#endif  /* L_fractusasf */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions from float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractsfqq)
DEFUN __fractsfqq
    ;; Multiply with 2^{24+7} to get a QQ result in r25
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XCALL   __fixsfsi
    mov     r24, r25
    ret
ENDF __fractsfqq
#endif  /* L_fractsfqq */

#if defined (L_fractsfuqq)
DEFUN __fractsfuqq
    ;; Multiply with 2^{24+8} to get a UQQ result in r25
    subi    r25, exp_hi (-32)
    XCALL   __fixunssfsi
    mov     r24, r25
    ret
ENDF __fractsfuqq
#endif  /* L_fractsfuqq */

#if defined (L_fractsfha)
DEFUN __fractsfha
    ;; Multiply with 2^{16+7} to get a HA result in r25:r24
    subi    r24, exp_lo (-23)
    sbci    r25, exp_hi (-23)
    XJMP    __fixsfsi
ENDF __fractsfha
#endif  /* L_fractsfha */

#if defined (L_fractsfuha)
DEFUN __fractsfuha
    ;; Multiply with 2^24 to get a UHA result in r25:r24
    subi    r25, exp_hi (-24)
    XJMP    __fixunssfsi
ENDF __fractsfuha
#endif  /* L_fractsfuha */

#if defined (L_fractsfhq)
FALIAS __fractsfsq

DEFUN __fractsfhq
    ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
    ;; resp. with 2^31 to get a SQ result in r25:r22
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XJMP    __fixsfsi
ENDF __fractsfhq
#endif  /* L_fractsfhq */

#if defined (L_fractsfuhq)
FALIAS __fractsfusq

DEFUN __fractsfuhq
    ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
    ;; resp. with 2^32 to get a USQ result in r25:r22
    subi    r25, exp_hi (-32)
    XJMP    __fixunssfsi
ENDF __fractsfuhq
#endif  /* L_fractsfuhq */

#if defined (L_fractsfsa)
DEFUN __fractsfsa
    ;; Multiply with 2^15 to get a SA result in r25:r22
    subi    r24, exp_lo (-15)
    sbci    r25, exp_hi (-15)
    XJMP    __fixsfsi
ENDF __fractsfsa
#endif  /* L_fractsfsa */

#if defined (L_fractsfusa)
DEFUN __fractsfusa
    ;; Multiply with 2^16 to get a USA result in r25:r22
    subi    r25, exp_hi (-16)
    XJMP    __fixunssfsi
ENDF __fractsfusa
#endif  /* L_fractsfusa */
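
;; A C model of the float -> fixed direction (illustrative sketch only,
;; not part of this library; the function name is made up): __fractsfqq
;; scales by 2^31 via the exponent, converts to a 32-bit integer, and
;; keeps the high byte.
;;
;;      #include <stdint.h>
;;
;;      static int8_t sf_to_qq (float f)
;;      {
;;          int32_t i = (int32_t) (f * 2147483648.0f);  /* f * 2^31 */
;;          return (int8_t) (i >> 24);                  /* high byte */
;;      }
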
;; For multiplication the functions here are called directly from
;; avr-fixed.md instead of using the standard libcall mechanisms.
;; This can make better code because GCC knows exactly which
;; of the call-used registers (not all of them) are clobbered.

/*******************************************************
    Fractional Multiplication  8 x 8  without MUL
*******************************************************/

#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
;;; R23 = R24 * R25
;;; Clobbers: __tmp_reg__, R22, R24, R25
;;; Rounding: ???
DEFUN __mulqq3
    XCALL   __fmuls
    ;; TR 18037 requires that (-1) * (-1) does not overflow.
    ;; The only input that can produce -1 is (-1)^2.
    dec     r23
    brvs    0f
    inc     r23
0:  ret
ENDF __mulqq3
#endif /* L_mulqq3 && ! HAVE_MUL */

/*******************************************************
    Fractional Multiply  .16 x .16  with and without MUL
*******************************************************/

#if defined (L_mulhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R23:R22) * (R25:R24)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
DEFUN __mulhq3
    XCALL   __mulhisi3
    ;; Shift result into place
    lsl     r23
    rol     r24
    rol     r25
    brvs    1f
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
1:  ;; Overflow.  TR 18037 requires (-1)^2 not to overflow
    ldi     r24, lo8 (0x7fff)
    ldi     r25, hi8 (0x7fff)
    ret
ENDF __mulhq3
#endif /* defined (L_mulhq3) */

#if defined (L_muluhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB < error <= 0.5 LSB
DEFUN __muluhq3
    XCALL   __umulhisi3
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
ENDF __muluhq3
#endif  /* L_muluhq3 */


/*******************************************************
    Fixed Multiply  8.8 x 8.8  with and without MUL
*******************************************************/

#if defined (L_mulha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R23:R22) * (R25:R24)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
DEFUN __mulha3
    XCALL   __mulhisi3
    lsl     r22
    rol     r23
    rol     r24
    XJMP    __muluha3_round
ENDF __mulha3
#endif  /* L_mulha3 */

#if defined (L_muluha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB < error <= 0.5 LSB
DEFUN __muluha3
    XCALL   __umulhisi3
    XJMP    __muluha3_round
ENDF __muluha3
#endif  /* L_muluha3 */

#if defined (L_muluha3_round)
DEFUN __muluha3_round
    ;; Shift result into place
    mov     r25, r24
    mov     r24, r23
    ;; Round
    sbrc    r22, 7
    adiw    r24, 1
    ret
ENDF __muluha3_round
#endif  /* L_muluha3_round */
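
;; A C sketch of the HQ multiply above (illustrative only, not part of
;; this library; the function name is made up): the s.15 x s.15 product
;; is s2.30, one left shift aligns it to s.31, and the round bit is
;; bit 15 of the shifted product.
;;
;;      #include <stdint.h>
;;
;;      static int16_t mulhq (int16_t a, int16_t b)
;;      {
;;          if (a == INT16_MIN && b == INT16_MIN)
;;              return INT16_MAX;           /* TR 18037: (-1)^2 saturates */
;;          int32_t p = (int32_t) a * b * 2;        /* align to s.31 */
;;          return (int16_t) ((p + 0x8000) >> 16);  /* round to s.15 */
;;      }
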
/*******************************************************
    Fixed Multiplication  16.16 x 16.16
*******************************************************/

;; Bits outside the result (below LSB), used in the signed version
#define GUARD __tmp_reg__

#if defined (__AVR_HAVE_MUL__)

;; Multiplier
#define A0 16
#define A1 A0+1
#define A2 A1+1
#define A3 A2+1

;; Multiplicand
#define B0 20
#define B1 B0+1
#define B2 B1+1
#define B3 B2+1

;; Result
#define C0 24
#define C1 C0+1
#define C2 C1+1
#define C3 C2+1

#if defined (L_mulusa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF __mulusa3

;;; Round for last digit iff T = 1
;;; Return guard bits in GUARD (__tmp_reg__).
;;; Rounding, T = 0: -1.0 LSB < error <= 0 LSB
;;; Rounding, T = 1: -0.5 LSB < error <= 0.5 LSB
DEFUN __mulusa3_round
    ;; Some of the MUL instructions have LSBs outside the result.
    ;; Don't ignore these LSBs in order to tame rounding error.
    ;; Use C2/C3 for these LSBs.

    clr C0
    clr C1
    mul A0, B0  $  movw C2, r0

    mul A1, B0  $  add C3, r0  $  adc C0, r1
    mul A0, B1  $  add C3, r0  $  adc C0, r1  $  rol C1

    ;; Round if T = 1.  Store guarding bits outside the result for rounding
    ;; and left-shift by the signed version (function below).
    brtc    0f
    sbrc    C3, 7
    adiw    C0, 1
0:  push    C3

    ;; The following MULs don't have LSBs outside the result.
    ;; C2/C3 is the high part.

    mul A0, B2  $  add C0, r0  $  adc C1, r1  $  sbc  C2, C2
    mul A1, B1  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    mul A2, B0  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    neg C2

    mul A0, B3  $  add C1, r0  $  adc C2, r1  $  sbc  C3, C3
    mul A1, B2  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul A2, B1  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul A3, B0  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    neg C3

    mul A1, B3  $  add C2, r0  $  adc C3, r1
    mul A2, B2  $  add C2, r0  $  adc C3, r1
    mul A3, B1  $  add C2, r0  $  adc C3, r1

    mul A2, B3  $  add C3, r0
    mul A3, B2  $  add C3, r0

    ;; Guard bits used in the signed version below.
    pop     GUARD
    clr     __zero_reg__
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */

#if defined (L_mulsa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Clobbers: __tmp_reg__, T
;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
DEFUN __mulsa3
    clt
    XCALL   __mulusa3_round
    ;; A posteriori sign extension of the operands
    tst     B3
    brpl    1f
    sub     C2, A0
    sbc     C3, A1
1:  sbrs    A3, 7
    rjmp    2f
    sub     C2, B0
    sbc     C3, B1
2:
    ;; Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif /* L_mulsa3 */
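
;; Why the a-posteriori sign extension in __mulsa3 works: reading a
;; signed 32-bit operand X as unsigned adds 2^32 iff X < 0, so with
;; Xs, Ys the signed and X, Y the unsigned readings:
;;
;;      Xs * Ys  ==  X * Y  -  (Ys < 0 ? X : 0) * 2^32
;;                          -  (Xs < 0 ? Y : 0) * 2^32    (mod 2^64)
;;
;; The window kept for a 16.16 result is bits 16..47 of the full
;; product, so each 2^32-scaled term only contributes the low 16 bits
;; of the other operand to bytes C2/C3: exactly the sub/sbc pairs above.
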
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3

#else /* __AVR_HAVE_MUL__ */

#define A0 18
#define A1 A0+1
#define A2 A0+2
#define A3 A0+3

#define B0 22
#define B1 B0+1
#define B2 B0+2
#define B3 B0+3

#define C0 22
#define C1 C0+1
#define C2 C0+2
#define C3 C0+3

;; __tmp_reg__
#define CC0 0
;; __zero_reg__
#define CC1 1
#define CC2 16
#define CC3 17

#define AA0 26
#define AA1 AA0+1
#define AA2 30
#define AA3 AA2+1

#if defined (L_mulsa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding: -1 LSB <= error <= 1 LSB
DEFUN __mulsa3
    push    B0
    push    B1
    push    B3
    clt
    XCALL   __mulusa3_round
    pop     r30
    ;; sign-extend B
    bst     r30, 7
    brtc    1f
    ;; A1, A0 survived in R27:R26
    sub     C2, AA0
    sbc     C3, AA1
1:
    pop     AA1  ;; B1
    pop     AA0  ;; B0

    ;; sign-extend A.  A3 survived in R31
    bst     AA3, 7
    brtc    2f
    sub     C2, AA0
    sbc     C3, AA1
2:
    ;; Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif  /* L_mulsa3 */

#if defined (L_mulusa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding: -1 LSB <= error <= 1 LSB
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF __mulusa3

;;; A[] survives in 26, 27, 30, 31
;;; Also used by __mulsa3 with T = 0
;;; Round if T = 1
;;; Return guard bits in GUARD (__tmp_reg__), used by signed version.
DEFUN __mulusa3_round
    push    CC2
    push    CC3
    ; clear result
    clr     __tmp_reg__
    wmov    CC2, CC0
    ; save multiplicand
    wmov    AA0, A0
    wmov    AA2, A2
    rjmp    3f

    ;; Loop the integral part

1:  ;; CC += A * 2^n;  n >= 0
    add  CC0,A0  $  adc CC1,A1  $  adc CC2,A2  $  adc CC3,A3

2:  ;; A <<= 1
    lsl  A0      $  rol A1      $  rol A2      $  rol A3

3:  ;; IBIT(B) >>= 1
    ;; Carry = n-th bit of B;  n >= 0
    lsr     B3
    ror     B2
    brcs    1b
    sbci    B3, 0
    brne    2b

    ;; Loop the fractional part
    ;; B2/B3 is 0 now, use as guard bits for rounding
    ;; Restore multiplicand
    wmov    A0, AA0
    wmov    A2, AA2
    rjmp    5f

4:  ;; CC += A:Guard * 2^n;  n < 0
    add  B3,B2  $  adc CC0,A0  $  adc CC1,A1  $  adc CC2,A2  $  adc CC3,A3
5:
    ;; A:Guard >>= 1
    lsr  A3  $  ror A2  $  ror A1  $  ror A0  $  ror B2

    ;; FBIT(B) <<= 1
    ;; Carry = n-th bit of B;  n < 0
    lsl     B0
    rol     B1
    brcs    4b
    sbci    B0, 0
    brne    5b

    ;; Save guard bits and set carry for rounding
    push    B3
    lsl     B3
    ;; Move result into place
    wmov    C2, CC2
    wmov    C0, CC0
    clr     __zero_reg__
    brtc    6f
    ;; Round iff T = 1
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
6:
    pop     GUARD
    ;; Epilogue
    pop     CC3
    pop     CC2
    ret
ENDF __mulusa3_round
#endif  /* L_mulusa3 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef AA0
#undef AA1
#undef AA2
#undef AA3
#undef CC0
#undef CC1
#undef CC2
#undef CC3

#endif /* __AVR_HAVE_MUL__ */

#undef GUARD

/***********************************************************
    Fixed unsigned saturated Multiplication  8.8 x 8.8
***********************************************************/

#define C0 22
#define C1 C0+1
#define C2 C0+2
#define C3 C0+3
#define SS __tmp_reg__

#if defined (L_usmuluha3)
DEFUN __usmuluha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __umulhisi3
    tst     C3
    brne    .Lmax
    ;; Round, target is in C1..C2
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brcs    .Lmax
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Saturate
    ldi     C2, 0xff
    ldi     C3, 0xff
    ret
ENDF __usmuluha3
#endif /* L_usmuluha3 */
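
;; A C sketch of __usmuluha3 (illustrative only, not part of this
;; library; the function name is made up): widen to 16.16, round at
;; bit 7, and saturate if anything remains above the 8.8 window.
;;
;;      #include <stdint.h>
;;
;;      static uint16_t usmuluha (uint16_t a, uint16_t b)
;;      {
;;          uint32_t p = (uint32_t) a * b + 0x80;   /* 16.16, rounded */
;;          if (p >> 24)
;;              return 0xFFFF;                      /* saturate */
;;          return (uint16_t) (p >> 8);             /* back to 8.8 */
;;      }
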
/***********************************************************
    Fixed signed saturated Multiplication  s8.7 x s8.7
***********************************************************/

#if defined (L_ssmulha3)
DEFUN __ssmulha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __mulhisi3
    ;; Adjust decimal point
    lsl     C0
    rol     C1
    rol     C2
    brvs    .LsatC3.3
    ;; The 9 MSBs must be the same
    rol     C3
    sbc     SS, SS
    cp      C3, SS
    brne    .LsatSS
    ;; Round
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brvs    .Lmax
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Load 0x7fff
    clr     C3
.LsatC3.3:
    ;; C3 <  0  -->  0x8000
    ;; C3 >= 0  -->  0x7fff
    mov     SS, C3
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x8000
    ;; SS =  0  -->  0x7fff
    ldi     C3, 0x7f
    ldi     C2, 0xff
    sbrc    SS, 7
    adiw    C2, 1
    ret
ENDF __ssmulha3
#endif /* L_ssmulha3 */

#undef C0
#undef C1
#undef C2
#undef C3
#undef SS

/***********************************************************
    Fixed unsigned saturated Multiplication  16.16 x 16.16
***********************************************************/

#define C0 18
#define C1 C0+1
#define C2 C0+2
#define C3 C0+3
#define C4 C0+4
#define C5 C0+5
#define C6 C0+6
#define C7 C0+7
#define SS __tmp_reg__

#if defined (L_usmulusa3)
;; R22[4] = R22[4] *{usat} R18[4]
;; Ordinary ABI function
DEFUN __usmulusa3
    ;; Widening multiply
    XCALL   __umulsidi3
    or      C7, C6
    brne    .Lmax
    ;; Round, target is in C2..C5
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brcs    .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret
.Lmax:
    ;; Saturate
    ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C4, C6
    ret
ENDF __usmulusa3
#endif /* L_usmulusa3 */

/***********************************************************
    Fixed signed saturated Multiplication  s16.15 x s16.15
***********************************************************/

#if defined (L_ssmulsa3)
;; R22[4] = R22[4] *{ssat} R18[4]
;; Ordinary ABI function
DEFUN __ssmulsa3
    ;; Widening multiply
    XCALL   __mulsidi3
    ;; Adjust decimal point
    lsl     C1
    rol     C2
    rol     C3
    rol     C4
    rol     C5
    brvs    .LsatC7.7
    ;; The 17 MSBs must be the same
    rol     C6
    rol     C7
    sbc     SS, SS
    cp      C6, SS
    cpc     C7, SS
    brne    .LsatSS
    ;; Round
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brvs    .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret

.Lmax:
    ;; Load 0x7fffffff
    clr     C7
.LsatC7.7:
    ;; C7 <  0  -->  0x80000000
    ;; C7 >= 0  -->  0x7fffffff
    lsl     C7
    sbc     SS, SS
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x80000000
    ;; SS =  0  -->  0x7fffffff
    com     SS
    mov     C4, SS
    mov     C5, C4
    wmov    C6, C4
    subi    C7, 0x80
    ret
ENDF __ssmulsa3
#endif /* L_ssmulsa3 */

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef SS
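
;; A C sketch of __ssmulsa3 (illustrative only, not part of this
;; library; the function name is made up): requiring the 17 MSBs of the
;; widened product to agree is the same as clamping the rounded value
;; to the s16.15 range.
;;
;;      #include <stdint.h>
;;
;;      static int32_t ssmulsa (int32_t a, int32_t b)
;;      {
;;          int64_t p = ((int64_t) a * b + 0x4000) >> 15;   /* round */
;;          if (p > INT32_MAX) return INT32_MAX;
;;          if (p < INT32_MIN) return INT32_MIN;
;;          return (int32_t) p;
;;      }
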
/*******************************************************
    Fractional Division  8 / 8
*******************************************************/

#define r_divd  r25    /* dividend */
#define r_quo   r24    /* quotient */
#define r_div   r22    /* divisor */
#define r_sign  __tmp_reg__

#if defined (L_divqq3)
DEFUN __divqq3
    mov     r_sign, r_divd
    eor     r_sign, r_div
    sbrc    r_div, 7
    neg     r_div
    sbrc    r_divd, 7
    neg     r_divd
    XCALL   __divqq_helper
    lsr     r_quo
    sbrc    r_sign, 7   ; negate result if needed
    neg     r_quo
    ret
ENDF __divqq3
#endif  /* L_divqq3 */

#if defined (L_udivuqq3)
DEFUN __udivuqq3
    cp      r_divd, r_div
    brsh    0f
    XJMP    __divqq_helper
    ;; Result is out of [0, 1)  ==>  Return 1 - eps.
0:  ldi     r_quo, 0xff
    ret
ENDF __udivuqq3
#endif  /* L_udivuqq3 */


#if defined (L_divqq_helper)
DEFUN __divqq_helper
    clr     r_quo           ; clear quotient
    inc     __zero_reg__    ; init loop counter, used per shift
__udivuqq3_loop:
    lsl     r_divd          ; shift dividend
    brcs    0f              ; dividend overflow
    cp      r_divd,r_div    ; compare dividend & divisor
    brcc    0f              ; dividend >= divisor
    rol     r_quo           ; shift quotient (with CARRY)
    rjmp    __udivuqq3_cont
0:
    sub     r_divd,r_div    ; restore dividend
    lsl     r_quo           ; shift quotient (without CARRY)
__udivuqq3_cont:
    lsl     __zero_reg__    ; shift loop-counter bit
    brne    __udivuqq3_loop
    com     r_quo           ; complement result
                            ; because C flag was complemented in loop
    ret
ENDF __divqq_helper
#endif  /* L_divqq_helper */

#undef r_divd
#undef r_quo
#undef r_div
#undef r_sign


/*******************************************************
    Fractional Division  16 / 16
*******************************************************/

#define r_divdL 26    /* dividend Low */
#define r_divdH 27    /* dividend High */
#define r_quoL  24    /* quotient Low */
#define r_quoH  25    /* quotient High */
#define r_divL  22    /* divisor */
#define r_divH  23    /* divisor */
#define r_cnt   21

#if defined (L_divhq3)
DEFUN __divhq3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    cp      r_divdL, r_divL
    cpc     r_divdH, r_divH
    breq    __divhq3_minus1 ; if equal return -1
    XCALL   __udivuhq3
    lsr     r_quoH
    ror     r_quoL
    sbrs    r0, 7           ; negate result if needed
    ret
    NEG2    r_quoL
    ret
__divhq3_minus1:
    ldi     r_quoH, 0x80
    clr     r_quoL
    ret
ENDF __divhq3
#endif  /* defined (L_divhq3) */

#if defined (L_udivuhq3)
DEFUN __udivuhq3
    sub     r_quoH,r_quoH   ; clear quotient and carry
    ;; FALLTHRU
ENDF __udivuhq3

DEFUN __udivuha3_common
    clr     r_quoL          ; clear quotient
    ldi     r_cnt,16        ; init loop counter
__udivuhq3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    brcs    __udivuhq3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    brcc    __udivuhq3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivuhq3_cont
__udivuhq3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivuhq3_cont:
    rol     r_quoH          ; shift quotient
    dec     r_cnt           ; decrement loop counter
    brne    __udivuhq3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    ret
ENDF __udivuha3_common
#endif  /* defined (L_udivuhq3) */
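
;; The loop above is plain restoring division producing one quotient
;; bit per step; the carry is kept complemented so it can double as the
;; quotient shift-in bit, hence the final com.  A C model (illustrative
;; only, not part of this library; the function name is made up):
;;
;;      #include <stdint.h>
;;
;;      static uint16_t udiv_q16 (uint16_t d1, uint16_t d2)  /* d1 < d2 */
;;      {
;;          uint16_t q = 0;
;;          uint32_t r = d1;
;;          for (int i = 0; i < 16; i++) {
;;              r <<= 1;
;;              q <<= 1;
;;              if (r >= d2) {
;;                  r -= d2;
;;                  q |= 1;
;;              }
;;          }
;;          return q;        /* == floor (d1 * 2^16 / d2) */
;;      }
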
/*******************************************************
    Fixed Division  8.8 / 8.8
*******************************************************/
#if defined (L_divha3)
DEFUN __divha3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    XCALL   __udivuha3
    lsr     r_quoH      ; adjust to 7 fractional bits
    ror     r_quoL
    sbrs    r0, 7       ; negate result if needed
    ret
    NEG2    r_quoL
    ret
ENDF __divha3
#endif  /* defined (L_divha3) */

#if defined (L_udivuha3)
DEFUN __udivuha3
    mov     r_quoH, r_divdL
    mov     r_divdL, r_divdH
    clr     r_divdH
    lsl     r_quoH              ; shift quotient into carry
    XJMP    __udivuha3_common   ; same as fractional after rearrange
ENDF __udivuha3
#endif  /* defined (L_udivuha3) */

#undef r_divdL
#undef r_divdH
#undef r_quoL
#undef r_quoH
#undef r_divL
#undef r_divH
#undef r_cnt

/*******************************************************
    Fixed Division  16.16 / 16.16
*******************************************************/

#define r_arg1L  24    /* arg1 gets passed already in place */
#define r_arg1H  25
#define r_arg1HL 26
#define r_arg1HH 27
#define r_divdL  26    /* dividend Low */
#define r_divdH  27
#define r_divdHL 30
#define r_divdHH 31    /* dividend High */
#define r_quoL   22    /* quotient Low */
#define r_quoH   23
#define r_quoHL  24
#define r_quoHH  25    /* quotient High */
#define r_divL   18    /* divisor Low */
#define r_divH   19
#define r_divHL  20
#define r_divHH  21    /* divisor High */
#define r_cnt  __zero_reg__  /* loop count (0 after the loop!) */

#if defined (L_divsa3)
DEFUN __divsa3
    mov     r0, r_arg1HH
    eor     r0, r_divHH
    sbrs    r_divHH, 7
    rjmp    1f
    NEG4    r_divL
1:
    sbrs    r_arg1HH, 7
    rjmp    2f
    NEG4    r_arg1L
2:
    XCALL   __udivusa3
    lsr     r_quoHH     ; adjust to 15 fractional bits
    ror     r_quoHL
    ror     r_quoH
    ror     r_quoL
    sbrs    r0, 7       ; negate result if needed
    ret
    ;; negate r_quoL
    XJMP    __negsi2
ENDF __divsa3
#endif  /* defined (L_divsa3) */

#if defined (L_udivusa3)
DEFUN __udivusa3
    ldi     r_divdHL, 32    ; init loop counter
    mov     r_cnt, r_divdHL
    clr     r_divdHL
    clr     r_divdHH
    wmov    r_quoL, r_divdHL
    lsl     r_quoHL         ; shift quotient into carry
    rol     r_quoHH
__udivusa3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    rol     r_divdHL
    rol     r_divdHH
    brcs    __udivusa3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    cpc     r_divdHL,r_divHL
    cpc     r_divdHH,r_divHH
    brcc    __udivusa3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivusa3_cont
__udivusa3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    sbc     r_divdHL,r_divHL
    sbc     r_divdHH,r_divHH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivusa3_cont:
    rol     r_quoH          ; shift quotient
    rol     r_quoHL
    rol     r_quoHH
    dec     r_cnt           ; decrement loop counter
    brne    __udivusa3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    com     r_quoHL
    com     r_quoHH
    ret
ENDF __udivusa3
#endif  /* defined (L_udivusa3) */

#undef r_arg1L
#undef r_arg1H
#undef r_arg1HL
#undef r_arg1HH
#undef r_divdL
#undef r_divdH
#undef r_divdHL
#undef r_divdHH
#undef r_quoL
#undef r_quoH
#undef r_quoHL
#undef r_quoHH
#undef r_divL
#undef r_divH
#undef r_divHL
#undef r_divHH
#undef r_cnt
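
;; Accumulator-format division only differs from the fractional one in
;; the pre-scaling of the dividend: an N.M quotient is (a << M) / b,
;; which is what the byte/word shuffles in __udivuha3 and __udivusa3
;; set up.  C model for 8.8 (illustrative only, not part of this
;; library; the function name is made up; valid for b != 0 when the
;; true quotient fits in 16 bits):
;;
;;      #include <stdint.h>
;;
;;      static uint16_t udivuha (uint16_t a, uint16_t b)
;;      {
;;          return (uint16_t) (((uint32_t) a << 8) / b);
;;      }
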
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0 24

#if defined (L_ssabs_1)
DEFUN __ssabs_1
    sbrs    A0, 7
    ret
    neg     A0
    sbrc    A0, 7
    dec     A0
    ret
ENDF __ssabs_1
#endif /* L_ssabs_1 */

#undef A0



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0 24
#define A1 A0+1

#if defined (L_ssneg_2)
DEFUN __ssneg_2
    NEG2    A0
    brvc    0f
    sbiw    A0, 1
0:  ret
ENDF __ssneg_2
#endif /* L_ssneg_2 */

#if defined (L_ssabs_2)
DEFUN __ssabs_2
    sbrs    A1, 7
    ret
    XJMP    __ssneg_2
ENDF __ssabs_2
#endif /* L_ssabs_2 */

#undef A0
#undef A1



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0 22
#define A1 A0+1
#define A2 A0+2
#define A3 A0+3

#if defined (L_ssneg_4)
DEFUN __ssneg_4
    XCALL   __negsi2
    brvc    0f
    ldi     A3, 0x7f
    ldi     A2, 0xff
    ldi     A1, 0xff
    ldi     A0, 0xff
0:  ret
ENDF __ssneg_4
#endif /* L_ssneg_4 */

#if defined (L_ssabs_4)
DEFUN __ssabs_4
    sbrs    A3, 7
    ret
    XJMP    __ssneg_4
ENDF __ssabs_4
#endif /* L_ssabs_4 */

#undef A0
#undef A1
#undef A2
#undef A3



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0 18
#define A1 A0+1
#define A2 A0+2
#define A3 A0+3
#define A4 A0+4
#define A5 A0+5
#define A6 A0+6
#define A7 A0+7

#if defined (L_clr_8)
FALIAS __usneguta2
FALIAS __usneguda2
FALIAS __usnegudq2

;; Clear Carry and all Bytes
DEFUN __clr_8
    ;; Clear Carry and set Z
    sub     A7, A7
    ;; FALLTHRU
ENDF __clr_8
;; Propagate Carry to all Bytes, Carry unaltered
DEFUN __sbc_8
    sbc     A7, A7
    sbc     A6, A6
    wmov    A4, A6
    wmov    A2, A6
    wmov    A0, A6
    ret
ENDF __sbc_8
#endif /* L_clr_8 */

#if defined (L_ssneg_8)
FALIAS __ssnegta2
FALIAS __ssnegda2
FALIAS __ssnegdq2

DEFUN __ssneg_8
    XCALL   __negdi2
    brvc    0f
    ;; A[] = 0x7fffffffffffffff
    sec
    XCALL   __sbc_8
    ldi     A7, 0x7f
0:  ret
ENDF __ssneg_8
#endif /* L_ssneg_8 */

#if defined (L_ssabs_8)
FALIAS __ssabsta2
FALIAS __ssabsda2
FALIAS __ssabsdq2

DEFUN __ssabs_8
    sbrs    A7, 7
    ret
    XJMP    __ssneg_8
ENDF __ssabs_8
#endif /* L_ssabs_8 */

;; Second Argument
#define B0 10
#define B1 B0+1
#define B2 B0+2
#define B3 B0+3
#define B4 B0+4
#define B5 B0+5
#define B6 B0+6
#define B7 B0+7

#if defined (L_usadd_8)
FALIAS __usadduta3
FALIAS __usadduda3
FALIAS __usaddudq3

DEFUN __usadd_8
    XCALL   __adddi3
    brcs    0f
    ret
0:  ;; A[] = 0xffffffffffffffff
    XJMP    __sbc_8
ENDF __usadd_8
#endif /* L_usadd_8 */

#if defined (L_ussub_8)
FALIAS __ussubuta3
FALIAS __ussubuda3
FALIAS __ussubudq3

DEFUN __ussub_8
    XCALL   __subdi3
    brcs    0f
    ret
0:  ;; A[] = 0
    XJMP    __clr_8
ENDF __ussub_8
#endif /* L_ussub_8 */
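
;; A C sketch of the unsigned saturated add above (illustrative only,
;; not part of this library; the function name is made up): a carry out
;; of the addition turns the result into all-ones via __sbc_8.
;;
;;      #include <stdint.h>
;;
;;      static uint64_t usadd (uint64_t a, uint64_t b)
;;      {
;;          uint64_t s = a + b;
;;          return s < a ? UINT64_MAX : s;   /* carry ==> saturate */
;;      }
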
#if defined (L_ssadd_8)
FALIAS __ssaddta3
FALIAS __ssaddda3
FALIAS __ssadddq3

DEFUN __ssadd_8
    XCALL   __adddi3
    brvc    0f
    ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
    cpi     B7, 0x80
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __ssadd_8
#endif /* L_ssadd_8 */

#if defined (L_sssub_8)
FALIAS __sssubta3
FALIAS __sssubda3
FALIAS __sssubdq3

DEFUN __sssub_8
    XCALL   __subdi3
    brvc    0f
    ;; A = (B < 0) ? INT64_MAX : INT64_MIN
    ldi     A7, 0x7f
    cp      A7, B7
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __sssub_8
#endif /* L_sssub_8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding Helpers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_mask1

#define AA 24
#define CC 25

;; R25 = 1 << (R24 & 7)
;; CC  = 1 << (AA  & 7)
;; Clobbers: None
DEFUN __mask1
    ;; CC = 2 ^ AA.1
    ldi     CC, 1 << 2
    sbrs    AA, 1
    ldi     CC, 1 << 0
    ;; CC *= 2 ^ AA.0
    sbrc    AA, 0
    lsl     CC
    ;; CC *= 2 ^ AA.2
    sbrc    AA, 2
    swap    CC
    ret
ENDF __mask1

#undef AA
#undef CC
#endif /* L_mask1 */
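
;; Worked example for __mask1:  AA = 6 = 0b110.
;;   AA.1 = 1  -->  CC = 1 << 2 = 0x04
;;   AA.0 = 0  -->  no extra lsl
;;   AA.2 = 1  -->  swap nibbles: 0x04 -> 0x40 = 1 << 6
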
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The rounding point.  Any bits smaller than
;; 2^{-RP} will be cleared.
#define RP R24

#define A0 22
#define A1 A0 + 1

#define C0 24
#define C1 C0 + 1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_roundqq3

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
DEFUN __roundqq3
    mov     __tmp_reg__, C1
    subi    RP, __QQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brvc    0f
    ldi     C0, 0x7f
    rjmp    9f
0:  ;; Mask out bits beyond RP
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF __roundqq3
#endif /* L_roundqq3 */

#ifdef L_rounduqq3

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
DEFUN __rounduqq3
    mov     __tmp_reg__, C1
    subi    RP, __UQQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brcc    0f
    ldi     C0, 0xff
    rjmp    9f
0:  ;; Mask out bits beyond RP
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF __rounduqq3
#endif /* L_rounduqq3 */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_addmask_2

;; [ R25:R24 = 1 << (R24 & 15)
;;   R23:R22 += 1 << (R24 & 15) ]
;; SREG is set according to the addition
DEFUN __addmask_2
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 3
    sbc     C0, C0
    ;; Swap C0 and C1 if RP.3 was set
    and     C0, C1
    eor     C1, C0
    ;; Finally, add the power-of-two:  A[] += C[]
    add     A0, C0
    adc     A1, C1
    ret
ENDF __addmask_2
#endif /* L_addmask_2 */

#ifdef L_round_s2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN __roundhq3
    subi    RP, __HQ_FBIT__ - __HA_FBIT__
ENDF __roundhq3
DEFUN __roundha3
    subi    RP, __HA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_s2_const
ENDF __roundha3

#endif /* L_round_s2 */

#ifdef L_round_u2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN __rounduhq3
    subi    RP, __UHQ_FBIT__ - __UHA_FBIT__
ENDF __rounduhq3
DEFUN __rounduha3
    subi    RP, __UHA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_u2_const
ENDF __rounduha3

#endif /* L_round_u2 */


#ifdef L_round_2_const

;; Helpers for 2 byte wide rounding

DEFUN __round_s2_const
    brvc    2f
    ldi     C1, 0x7f
    rjmp    1f
    ;; FALLTHRU (Barrier)
ENDF __round_s2_const

DEFUN __round_u2_const
    brcc    2f
    ldi     C1, 0xff
1:
    ldi     C0, 0xff
    rjmp    9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    ;;
    NEG2    C0
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
9:  ret
ENDF __round_u2_const

#endif /* L_round_2_const */

#undef A0
#undef A1
#undef C0
#undef C1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#define A0 18
#define A1 A0 + 1
#define A2 A0 + 2
#define A3 A0 + 3

#define C0 22
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3

#ifdef L_addmask_4

;; [ R25:R22 = 1 << (R24 & 31)
;;   R21:R18 += 1 << (R24 & 31) ]
;; SREG is set according to the addition
DEFUN __addmask_4
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 4
    sbc     C0, C0
    sbc     C1, C1
    ;; Swap C2 with C3 if RP.3 is not set
    cpi     RP, 1 << 3
    sbc     C2, C2
    and     C2, C3
    eor     C3, C2
    ;; Swap C3:C2 with C1:C0 if RP.4 is not set
    and     C0, C2  $  eor C2, C0
    and     C1, C3  $  eor C3, C1
    ;; Finally, add the power-of-two:  A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    ret
ENDF __addmask_4
#endif /* L_addmask_4 */

#ifdef L_round_s4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN __roundsq3
    subi    RP, __SQ_FBIT__ - __SA_FBIT__
ENDF __roundsq3
DEFUN __roundsa3
    subi    RP, __SA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_s4_const
ENDF __roundsa3

#endif /* L_round_s4 */

#ifdef L_round_u4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN __roundusq3
    subi    RP, __USQ_FBIT__ - __USA_FBIT__
ENDF __roundusq3
DEFUN __roundusa3
    subi    RP, __USA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_u4_const
ENDF __roundusa3

#endif /* L_round_u4 */
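
;; A C sketch of the rounding scheme used by the helpers above
;; (illustrative only, not part of this library; the function name is
;; made up, and the add-saturate step on overflow is omitted): add half
;; of the last place that survives, then clear everything below it.
;;
;;      #include <stdint.h>
;;
;;      static uint16_t round_bits (uint16_t a, int fbit, int rp)
;;      {
;;          uint16_t bit = (uint16_t) 1 << (fbit - 1 - rp);  /* 0.5 ulp */
;;          return (uint16_t) ((a + bit) & -(2 * bit));
;;      }
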
#ifdef L_round_4_const

;; Helpers for 4 byte wide rounding

DEFUN __round_s4_const
    brvc    2f
    ldi     C3, 0x7f
    rjmp    1f
    ;; FALLTHRU (Barrier)
ENDF __round_s4_const

DEFUN __round_u4_const
    brcc    2f
    ldi     C3, 0xff
1:
    ldi     C2, 0xff
    ldi     C1, 0xff
    ldi     C0, 0xff
    rjmp    9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    rol     C2
    rol     C3
    XCALL   __negsi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
9:  ret
ENDF __round_u4_const

#endif /* L_round_4_const */

#undef A0
#undef A1
#undef A2
#undef A3
#undef C0
#undef C1
#undef C2
#undef C3

#undef RP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#define RP     16
#define FBITm1 31

#define C0 18
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3
#define C4 C0 + 4
#define C5 C0 + 5
#define C6 C0 + 6
#define C7 C0 + 7

#define A0 16
#define A1 17
#define A2 26
#define A3 27
#define A4 28
#define A5 29
#define A6 30
#define A7 31


#ifdef L_rounddq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __rounddq3
    ldi     FBITm1, __DQ_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __rounddq3
#endif /* L_rounddq3 */

#ifdef L_roundudq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __roundudq3
    ldi     FBITm1, __UDQ_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __roundudq3
#endif /* L_roundudq3 */

#ifdef L_roundda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __roundda3
    ldi     FBITm1, __DA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __roundda3
#endif /* L_roundda3 */

#ifdef L_rounduda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __rounduda3
    ldi     FBITm1, __UDA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __rounduda3
#endif /* L_rounduda3 */

#ifdef L_roundta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __roundta3
    ldi     FBITm1, __TA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __roundta3
#endif /* L_roundta3 */

#ifdef L_rounduta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __rounduta3
    ldi     FBITm1, __UTA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __rounduta3
#endif /* L_rounduta3 */
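
;; The six entry points above only differ in the fixed-point width
;; (FBITm1) and in T selecting the unsigned overflow check.  A C sketch
;; of the common worker below (illustrative only, not part of this
;; library; the function name is made up and saturation is omitted):
;;
;;      #include <stdint.h>
;;
;;      static uint64_t round_bits_64 (uint64_t a, int fbitm1, int rp)
;;      {
;;          uint64_t bit = 1ull << (fbitm1 - rp);   /* 0.5 ulp */
;;          return (a + bit) & -(2 * bit);
;;      }
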
#ifdef L_round_x8
DEFUN __round_x8
    push    r16
    push    r17
    push    r28
    push    r29
    ;; Compute log2 of addend from rounding point
    sub     RP, FBITm1
    neg     RP
    ;; Move input to work register A[]
    push    C0
    mov     A1, C1
    wmov    A2, C2
    wmov    A4, C4
    wmov    A6, C6
    ;; C[] = 1 << (FBIT-1 - RP)
    XCALL   __clr_8
    inc     C0
    XCALL   __ashldi3
    pop     A0
    ;; A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    adc     A4, C4
    adc     A5, C5
    adc     A6, C6
    adc     A7, C7
    brts    1f
    ;; Signed
    brvc    3f
    ;; Signed overflow: A[] = 0x7f...
    brvs    2f
1:  ;; Unsigned
    brcc    3f
    ;; Unsigned overflow: A[] = 0xff...
2:  ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C0, C6
    wmov    C2, C6
    wmov    C4, C6
    bld     C7, 7
    rjmp    9f
3:
    ;; C[] = -C[] - C[]
    push    A0
    ldi     r16, 1
    XCALL   __ashldi3
    pop     A0
    XCALL   __negdi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
    and     C4, A4
    and     C5, A5
    and     C6, A6
    and     C7, A7
9:  ;; Epilogue
    pop     r29
    pop     r28
    pop     r17
    pop     r16
    ret
ENDF __round_x8

#endif /* L_round_x8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7

#undef RP
#undef FBITm1


;; Supply implementations / symbols for the bit-banging functions
;; __builtin_avr_bitsfx and __builtin_avr_fxbits
#ifdef L_ret
DEFUN __ret
    ret
ENDF __ret
#endif /* L_ret */