/*  -*- Mode: Asm -*-  */
;;    Copyright (C) 2012-2013 Free Software Foundation, Inc.
;;    Contributed by Sean D'Epagnier  (sean@depagnier.com)
;;                   Georg-Johann Lay (avr@gjlay.de)

;; This file is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by the
;; Free Software Foundation; either version 3, or (at your option) any
;; later version.

;; In addition to the permissions in the GNU General Public License, the
;; Free Software Foundation gives you unlimited permission to link the
;; compiled version of this file into combinations with other programs,
;; and to distribute those combinations without any restriction coming
;; from the use of this file.  (The General Public License restrictions
;; do apply in other respects; for example, they cover modification of
;; the file, and distribution when not linked into a combine
;; executable.)

;; This file is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fixed point library routines for AVR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.section .text.libgcc.fixed, "ax", @progbits

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions to float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

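;; Layout of the fixed-point types handled in this file, as used by
;; avr-gcc (the fractional bit counts match the __*_FBIT__ macros
;; referenced by the rounding code below):
;;   QQ = s.7     UQQ = .8      (1 byte)
;;   HQ = s.15    UHQ = .16     (2 bytes)
;;   HA = s8.7    UHA = 8.8     (2 bytes)
;;   SQ = s.31    USQ = .32     (4 bytes)
;;   SA = s16.15  USA = 16.16   (4 bytes)
;; The 1- and 2-byte conversions below just widen their argument into
;; the high bytes of a (U)SA value, sign- resp. zero-extend, and then
;; tail-call __fractsasf resp. __fractusasf.
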
#if defined (L_fractqqsf)
DEFUN __fractqqsf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Sign-extend
    lsl     r24
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fractqqsf
#endif  /* L_fractqqsf */

#if defined (L_fractuqqsf)
DEFUN __fractuqqsf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuqqsf
#endif  /* L_fractuqqsf */

#if defined (L_fracthqsf)
DEFUN __fracthqsf
    ;; Move in place for SA -> SF conversion
    wmov    22, 24
    ;; Sign-extend
    lsl     r25
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fracthqsf
#endif  /* L_fracthqsf */

#if defined (L_fractuhqsf)
DEFUN __fractuhqsf
    ;; Move in place for USA -> SF conversion
    wmov    22, 24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhqsf
#endif  /* L_fractuhqsf */

#if defined (L_fracthasf)
DEFUN __fracthasf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Sign-extend
    lsl     r25
    sbc     r25, r25
    XJMP    __fractsasf
ENDF __fracthasf
#endif  /* L_fracthasf */

#if defined (L_fractuhasf)
DEFUN __fractuhasf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Zero-extend
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhasf
#endif  /* L_fractuhasf */


#if defined (L_fractsqsf)
DEFUN __fractsqsf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^31 to move the
    ;; binary point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (31)
    sbci    r25, exp_hi (31)
0:  ret
ENDF __fractsqsf
#endif  /* L_fractsqsf */

#if defined (L_fractusqsf)
DEFUN __fractusqsf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^32 to move the
    ;; binary point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (32)
    ret
ENDF __fractusqsf
#endif  /* L_fractusqsf */

#if defined (L_fractsasf)
DEFUN __fractsasf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^15 to move the
    ;; binary point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (15)
    sbci    r25, exp_hi (15)
0:  ret
ENDF __fractsasf
#endif  /* L_fractsasf */

#if defined (L_fractusasf)
DEFUN __fractusasf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^16 to move the
    ;; binary point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (16)
    ret
ENDF __fractusasf
#endif  /* L_fractusasf */
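
;; A note on the exp_lo/exp_hi scaling used above (and, with negated
;; offsets, in the conversions from float below): exp_lo(N) and
;; exp_hi(N) are effectively the two high bytes of N << 23, i.e. N
;; placed in the exponent field of an IEEE-754 single.  Subtracting
;; them from R25:R24 decrements the biased exponent by N and thus
;; scales a non-zero, normal float by 2^-N without a multiplication:
;;     1.0 = 0x3f800000, exponent 127;  minus 31 << 23
;;     ==>  exponent 96  ==>  0x30000000 = 2^-31
;; Zero is skipped (tst/cpse above) because its encoding carries no
;; exponent that could be adjusted.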

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions from float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractsfqq)
DEFUN __fractsfqq
    ;; Multiply with 2^{24+7} to get a QQ result in r25
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XCALL   __fixsfsi
    mov     r24, r25
    ret
ENDF __fractsfqq
#endif  /* L_fractsfqq */

#if defined (L_fractsfuqq)
DEFUN __fractsfuqq
    ;; Multiply with 2^{24+8} to get a UQQ result in r25
    subi    r25, exp_hi (-32)
    XCALL   __fixunssfsi
    mov     r24, r25
    ret
ENDF __fractsfuqq
#endif  /* L_fractsfuqq */

#if defined (L_fractsfha)
DEFUN __fractsfha
    ;; Multiply with 2^{16+7} to get a HA result in r25:r24
    subi    r24, exp_lo (-23)
    sbci    r25, exp_hi (-23)
    XJMP    __fixsfsi
ENDF __fractsfha
#endif  /* L_fractsfha */

#if defined (L_fractsfuha)
DEFUN __fractsfuha
    ;; Multiply with 2^24 to get a UHA result in r25:r24
    subi    r25, exp_hi (-24)
    XJMP    __fixunssfsi
ENDF __fractsfuha
#endif  /* L_fractsfuha */

#if defined (L_fractsfhq)
FALIAS __fractsfsq

DEFUN __fractsfhq
    ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
    ;; resp. with 2^31 to get a SQ result in r25:r22
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XJMP    __fixsfsi
ENDF __fractsfhq
#endif  /* L_fractsfhq */

#if defined (L_fractsfuhq)
FALIAS __fractsfusq

DEFUN __fractsfuhq
    ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
    ;; resp. with 2^32 to get a USQ result in r25:r22
    subi    r25, exp_hi (-32)
    XJMP    __fixunssfsi
ENDF __fractsfuhq
#endif  /* L_fractsfuhq */

#if defined (L_fractsfsa)
DEFUN __fractsfsa
    ;; Multiply with 2^15 to get a SA result in r25:r22
    subi    r24, exp_lo (-15)
    sbci    r25, exp_hi (-15)
    XJMP    __fixsfsi
ENDF __fractsfsa
#endif  /* L_fractsfsa */

#if defined (L_fractsfusa)
DEFUN __fractsfusa
    ;; Multiply with 2^16 to get a USA result in r25:r22
    subi    r25, exp_hi (-16)
    XJMP    __fixunssfsi
ENDF __fractsfusa
#endif  /* L_fractsfusa */
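
;; All float-to-fixed conversions above follow one pattern: pre-scale
;; the float by 2^N via its exponent, then let __fixsfsi resp.
;; __fixunssfsi produce a 32-bit integer whose bytes already are the
;; fixed-point result in the desired registers.  As a rough C model of
;; __fractsfha (illustration only, not part of this file):
;;     int32_t fract_sf_ha (float x)
;;     {
;;         /* HA has 7 fractional bits; the extra factor 2^16 moves
;;            the result into the high bytes of the SI value.  */
;;         return (int32_t) (x * (1L << 23));
;;     }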


;; For multiplication the functions here are called directly from
;; avr-fixed.md instead of using the standard libcall mechanisms.
;; This can make better code because GCC knows exactly which
;; of the call-used registers (not all of them) are clobbered.

/*******************************************************
    Fractional  Multiplication  8 x 8  without MUL
*******************************************************/

#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
;;; R23 = R24 * R25
;;; Clobbers: __tmp_reg__, R22, R24, R25
;;; Rounding: ???
DEFUN __mulqq3
    XCALL   __fmuls
    ;; TR 18037 requires that  (-1) * (-1)  does not overflow
    ;; The only input that can produce  -1  is  (-1)^2.
    dec     r23
    brvs    0f
    inc     r23
0:  ret
ENDF  __mulqq3
#endif /* L_mulqq3 && ! HAVE_MUL */
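
;; How the saturation above works: the raw __fmuls result is -1.0
;; (r23 = 0x80) only for the input (-1) * (-1), whose exact value +1
;; is not representable in QQ.  AVR's DEC sets the V flag exactly when
;; its operand was 0x80, so  dec r23  both detects that case and
;; already leaves the saturated maximum 0x7f in place; for all other
;; values the inc undoes the dec.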

/*******************************************************
    Fractional Multiply  .16 x .16  with and without MUL
*******************************************************/

#if defined (L_mulhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <= error  <=  0.5 LSB
DEFUN   __mulhq3
    XCALL   __mulhisi3
    ;; Shift result into place
    lsl     r23
    rol     r24
    rol     r25
    brvs    1f
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
1:  ;; Overflow.  TR 18037 requires  (-1)^2  not to overflow
    ldi     r24, lo8 (0x7fff)
    ldi     r25, hi8 (0x7fff)
    ret
ENDF __mulhq3
#endif  /* defined (L_mulhq3) */

#if defined (L_muluhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN   __muluhq3
    XCALL   __umulhisi3
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
ENDF __muluhq3
#endif  /* L_muluhq3 */
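
;; Arithmetic behind both .16 x .16 multiplies above: the widening
;; multiply returns the 32-bit product, which carries 30 (signed)
;; resp. 32 (unsigned) fractional bits.  In value terms
;;     HQ:   result = round ((a * b) >> 15)   (one left shift aligns)
;;     UHQ:  result = round ((a * b) >> 16)
;; with the rounding bit being bit 7 of the upper discarded byte,
;; hence  sbrc r23, 7 / adiw r24, 1.  The only possible overflow is
;; (-1) * (-1) = +1, caught via the V flag and clipped to 0x7fff.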


/*******************************************************
    Fixed  Multiply  8.8 x 8.8  with and without MUL
*******************************************************/

#if defined (L_mulha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <=  error  <=  0.5 LSB
DEFUN   __mulha3
    XCALL   __mulhisi3
    lsl     r22
    rol     r23
    rol     r24
    XJMP    __muluha3_round
ENDF __mulha3
#endif  /* L_mulha3 */

#if defined (L_muluha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN   __muluha3
    XCALL   __umulhisi3
    XJMP    __muluha3_round
ENDF __muluha3
#endif  /* L_muluha3 */

#if defined (L_muluha3_round)
DEFUN   __muluha3_round
    ;; Shift result into place
    mov     r25, r24
    mov     r24, r23
    ;; Round
    sbrc    r22, 7
    adiw    r24, 1
    ret
ENDF __muluha3_round
#endif  /* L_muluha3_round */
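
;; Worked 8.8 example for the rounding helper above: the result is the
;; middle 16 bits of the 32-bit product, rounded by bit 7 of the low
;; discard byte.  In UHA, for instance:
;;     1.5 = 0x0180,  2.5 = 0x0280
;;     0x0180 * 0x0280 = 0x0003c000
;;     middle bytes 0x03c0 = 3.75   (exact; discard byte 0x00
;;                                   contributes no rounding)
;; The signed 8.8 multiply pre-shifts the product left by one bit,
;; because s8.7 values have 7 instead of 8 fractional bits.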


/*******************************************************
    Fixed  Multiplication  16.16 x 16.16
*******************************************************/

;; Bits outside the result (below LSB), used in the signed version
#define GUARD __tmp_reg__

#if defined (__AVR_HAVE_MUL__)

;; Multiplier
#define A0  16
#define A1  A0+1
#define A2  A1+1
#define A3  A2+1

;; Multiplicand
#define B0  20
#define B1  B0+1
#define B2  B1+1
#define B3  B2+1

;; Result
#define C0  24
#define C1  C0+1
#define C2  C1+1
#define C3  C2+1

#if defined (L_mulusa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; Round for last digit iff T = 1
;;; Return guard bits in GUARD (__tmp_reg__).
;;; Rounding, T = 0:  -1.0 LSB  <  error  <=  0   LSB
;;; Rounding, T = 1:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN __mulusa3_round
    ;; Some of the MUL instructions have LSBs outside the result.
    ;; Don't ignore these LSBs in order to tame rounding error.
    ;; Use C2/C3 for these LSBs.

    clr C0
    clr C1
    mul A0, B0  $  movw C2, r0

    mul A1, B0  $  add  C3, r0  $  adc C0, r1
    mul A0, B1  $  add  C3, r0  $  adc C0, r1  $  rol C1

    ;; Round if T = 1.  Store the guard bits outside the result; the
    ;; signed version (function below) uses them for its left shift
    ;; and rounding.
    brtc 0f
    sbrc C3, 7
    adiw C0, 1
0:  push C3

    ;; The following MULs don't have LSBs outside the result.
    ;; C2/C3 is the high part.

    mul  A0, B2  $  add C0, r0  $  adc C1, r1  $  sbc  C2, C2
    mul  A1, B1  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    mul  A2, B0  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    neg  C2

    mul  A0, B3  $  add C1, r0  $  adc C2, r1  $  sbc  C3, C3
    mul  A1, B2  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul  A2, B1  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul  A3, B0  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    neg  C3

    mul  A1, B3  $  add C2, r0  $  adc C3, r1
    mul  A2, B2  $  add C2, r0  $  adc C3, r1
    mul  A3, B1  $  add C2, r0  $  adc C3, r1

    mul  A2, B3  $  add C3, r0
    mul  A3, B2  $  add C3, r0

    ;; Guard bits used in the signed version below.
    pop  GUARD
    clr  __zero_reg__
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */
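
;; Scheme of the routine above: a 16.16 x 16.16 product needs bits
;; 16..47 of the full 64-bit product.  The partial product Ai * Bj has
;; weight 2^(8*(i+j)), so A3 * B3 is not needed at all; the i+j <= 1
;; terms lie (partly) below bit 16 and are tracked as guard bits
;; (first in C2/C3, then on the stack); and of the i+j = 5 terms only
;; the low bytes matter.  The '$' just separates several instructions
;; written on one line.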

#if defined (L_mulsa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Clobbers: __tmp_reg__, T
;;; Rounding:  -0.5 LSB  <=  error  <=  0.5 LSB
DEFUN __mulsa3
    clt
    XCALL   __mulusa3_round
    ;; A posteriori sign extension of the operands
    tst     B3
    brpl 1f
    sub     C2, A0
    sbc     C3, A1
1:  sbrs    A3, 7
    rjmp 2f
    sub     C2, B0
    sbc     C3, B1
2:
    ;;  Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif /* L_mulsa3 */
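
;; The sign fix-up above is the standard identity for reusing an
;; unsigned widening multiply on signed operands:
;;     (s64) a * b  =  (u64) a * b
;;                     - (a < 0  ?  (u64) b << 32  :  0)
;;                     - (b < 0  ?  (u64) a << 32  :  0)
;; Since only bits 16..47 of the product are kept, each correction
;; term affects just the top result bytes C2:C3 (bits 32..47), which
;; is why subtracting the low word of the other operand suffices.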

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3

#else /* __AVR_HAVE_MUL__ */

#define A0 18
#define A1 A0+1
#define A2 A0+2
#define A3 A0+3

#define B0 22
#define B1 B0+1
#define B2 B0+2
#define B3 B0+3

#define C0  22
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3

;; __tmp_reg__
#define CC0  0
;; __zero_reg__
#define CC1  1
#define CC2  16
#define CC3  17

#define AA0  26
#define AA1  AA0+1
#define AA2  30
#define AA3  AA2+1

#if defined (L_mulsa3)
;;; (R25:R22)  *=  (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB  <=  error  <=  1 LSB
DEFUN   __mulsa3
    push    B0
    push    B1
    push    B3
    clt
    XCALL   __mulusa3_round
    pop     r30
    ;; sign-extend B
    bst     r30, 7
    brtc 1f
    ;; A1, A0 survived in  R27:R26
    sub     C2, AA0
    sbc     C3, AA1
1:
    pop     AA1  ;; B1
    pop     AA0  ;; B0

    ;; sign-extend A.  A3 survived in  R31
    bst     AA3, 7
    brtc 2f
    sub     C2, AA0
    sbc     C3, AA1
2:
    ;;  Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif  /* L_mulsa3 */

#if defined (L_mulusa3)
;;; (R25:R22)  *=  (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB  <=  error  <=  1 LSB
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; A[] survives in 26, 27, 30, 31
;;; Also used by __mulsa3 with T = 0
;;; Round if T = 1
;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
DEFUN __mulusa3_round
    push    CC2
    push    CC3
    ; clear result
    clr     __tmp_reg__
    wmov    CC2, CC0
    ; save multiplicand
    wmov    AA0, A0
    wmov    AA2, A2
    rjmp 3f

    ;; Loop the integral part

1:  ;; CC += A * 2^n;  n >= 0
    add  CC0,A0  $  adc CC1,A1  $  adc  CC2,A2  $  adc  CC3,A3

2:  ;; A <<= 1
    lsl  A0      $  rol A1      $  rol  A2      $  rol  A3

3:  ;; IBIT(B) >>= 1
    ;; Carry = n-th bit of B;  n >= 0
    lsr     B3
    ror     B2
    brcs 1b
    sbci    B3, 0
    brne 2b

    ;; Loop the fractional part
    ;; B2/B3 is 0 now, use as guard bits for rounding
    ;; Restore multiplicand
    wmov    A0, AA0
    wmov    A2, AA2
    rjmp 5f

4:  ;; CC += A:Guard * 2^n;  n < 0
    add  B3,B2 $  adc  CC0,A0  $  adc  CC1,A1  $  adc  CC2,A2  $  adc  CC3,A3
5:
    ;; A:Guard >>= 1
    lsr  A3   $  ror  A2  $  ror  A1  $  ror   A0  $   ror  B2

    ;; FBIT(B) <<= 1
    ;; Carry = n-th bit of B;  n < 0
    lsl     B0
    rol     B1
    brcs 4b
    sbci    B0, 0
    brne 5b

    ;; Save guard bits and set carry for rounding
    push    B3
    lsl     B3
    ;; Move result into place
    wmov    C2, CC2
    wmov    C0, CC0
    clr     __zero_reg__
    brtc 6f
    ;; Round iff T = 1
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
6:
    pop     GUARD
    ;; Epilogue
    pop     CC3
    pop     CC2
    ret
ENDF __mulusa3_round
#endif  /* L_mulusa3 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef AA0
#undef AA1
#undef AA2
#undef AA3
#undef CC0
#undef CC1
#undef CC2
#undef CC3

#endif /* __AVR_HAVE_MUL__ */

#undef GUARD

/***********************************************************
    Fixed  unsigned saturated Multiplication  8.8 x 8.8
***********************************************************/

#define C0  22
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3
#define SS __tmp_reg__

#if defined (L_usmuluha3)
DEFUN __usmuluha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __umulhisi3
    tst     C3
    brne .Lmax
    ;; Round, target is in C1..C2
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brcs .Lmax
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Saturate
    ldi     C2, 0xff
    ldi     C3, 0xff
    ret
ENDF  __usmuluha3
#endif /* L_usmuluha3 */

/***********************************************************
    Fixed signed saturated Multiplication  s8.7 x s8.7
***********************************************************/

#if defined (L_ssmulha3)
DEFUN __ssmulha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __mulhisi3
    ;; Adjust binary point
    lsl     C0
    rol     C1
    rol     C2
    brvs .LsatC3.3
    ;; The 9 MSBs must be the same
    rol     C3
    sbc     SS, SS
    cp      C3, SS
    brne .LsatSS
    ;; Round
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brvs .Lmax
    ;; Move result into place
    mov    C3, C2
    mov    C2, C1
    ret
.Lmax:
    ;; Load 0x7fff
    clr     C3
.LsatC3.3:
    ;; C3 <  0 -->  0x8000
    ;; C3 >= 0 -->  0x7fff
    mov     SS, C3
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x8000
    ;; SS =  0  -->  0x7fff
    ldi     C3, 0x7f
    ldi     C2, 0xff
    sbrc    SS, 7
    adiw    C2, 1
    ret
ENDF  __ssmulha3
#endif /* L_ssmulha3 */
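
;; Saturation conditions of the two routines above, in value terms:
;;  - Unsigned 8.8: overflow iff the rounded product needs more than
;;    16 bits, i.e. the top product byte is non-zero or the rounding
;;    increment carries out; the result is then clipped to 0xffff.
;;  - Signed s8.7: after aligning the product to 15 fractional bits,
;;    the value fits iff the top 9 bits are all copies of the sign
;;    bit; otherwise the result is clipped to 0x7fff (positive) resp.
;;    0x8000 (negative overflow), as TR 18037 prescribes for the
;;    saturated types.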

#undef C0
#undef C1
#undef C2
#undef C3
#undef SS

/***********************************************************
    Fixed  unsigned saturated Multiplication  16.16 x 16.16
***********************************************************/

#define C0  18
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3
#define C4  C0+4
#define C5  C0+5
#define C6  C0+6
#define C7  C0+7
#define SS __tmp_reg__

#if defined (L_usmulusa3)
;; R22[4] = R22[4] *{usat} R18[4]
;; Ordinary ABI function
DEFUN __usmulusa3
    ;; Widening multiply
    XCALL   __umulsidi3
    or      C7, C6
    brne .Lmax
    ;; Round, target is in C2..C5
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brcs .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret
.Lmax:
    ;; Saturate
    ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C4, C6
    ret
ENDF  __usmulusa3
#endif /* L_usmulusa3 */

/***********************************************************
    Fixed signed saturated Multiplication  s16.15 x s16.15
***********************************************************/

#if defined (L_ssmulsa3)
;; R22[4] = R22[4] *{ssat} R18[4]
;; Ordinary ABI function
DEFUN __ssmulsa3
    ;; Widening multiply
    XCALL   __mulsidi3
    ;; Adjust binary point
    lsl     C1
    rol     C2
    rol     C3
    rol     C4
    rol     C5
    brvs .LsatC7.7
    ;; The 17 MSBs must be the same
    rol     C6
    rol     C7
    sbc     SS, SS
    cp      C6, SS
    cpc     C7, SS
    brne .LsatSS
    ;; Round
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brvs .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret

.Lmax:
    ;; Load 0x7fffffff
    clr     C7
.LsatC7.7:
    ;; C7 <  0 -->  0x80000000
    ;; C7 >= 0 -->  0x7fffffff
    lsl     C7
    sbc     SS, SS
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x80000000
    ;; SS =  0  -->  0x7fffffff
    com     SS
    mov     C4, SS
    mov     C5, C4
    wmov    C6, C4
    subi    C7, 0x80
    ret
ENDF  __ssmulsa3
#endif /* L_ssmulsa3 */

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef SS

/*******************************************************
      Fractional Division 8 / 8
*******************************************************/

#define r_divd  r25     /* dividend */
#define r_quo   r24     /* quotient */
#define r_div   r22     /* divisor */
#define r_sign  __tmp_reg__

#if defined (L_divqq3)
DEFUN   __divqq3
    mov     r_sign, r_divd
    eor     r_sign, r_div
    sbrc    r_div, 7
    neg     r_div
    sbrc    r_divd, 7
    neg     r_divd
    XCALL   __divqq_helper
    lsr     r_quo
    sbrc    r_sign, 7   ; negate result if needed
    neg     r_quo
    ret
ENDF __divqq3
#endif  /* L_divqq3 */

#if defined (L_udivuqq3)
DEFUN   __udivuqq3
    cp      r_divd, r_div
    brsh    0f
    XJMP __divqq_helper
    ;; Result is out of [0, 1)  ==>  Return 1 - eps.
0:  ldi     r_quo, 0xff
    ret
ENDF __udivuqq3
#endif  /* L_udivuqq3 */


#if defined (L_divqq_helper)
DEFUN   __divqq_helper
    clr     r_quo           ; clear quotient
    inc     __zero_reg__    ; init loop counter, used per shift
__udivuqq3_loop:
    lsl     r_divd          ; shift dividend
    brcs    0f              ; dividend overflow
    cp      r_divd,r_div    ; compare dividend & divisor
    brcc    0f              ; dividend >= divisor
    rol     r_quo           ; shift quotient (with CARRY)
    rjmp    __udivuqq3_cont
0:
    sub     r_divd,r_div    ; restore dividend
    lsl     r_quo           ; shift quotient (without CARRY)
__udivuqq3_cont:
    lsl     __zero_reg__    ; shift loop-counter bit
    brne    __udivuqq3_loop
    com     r_quo           ; complement result
                            ; because C flag was complemented in loop
    ret
ENDF __divqq_helper
#endif  /* L_divqq_helper */
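
;; Two tricks in the helper above deserve a note:
;;  - The loop counter is __zero_reg__ itself: it starts at 1 and is
;;    shifted left once per bit, so after 8 iterations it overflows to
;;    0, which both terminates the loop and restores __zero_reg__ = 0
;;    without any extra register.
;;  - The quotient is accumulated with complemented bits: the carry
;;    from the dividend/divisor comparison is rolled in directly (it
;;    is the inverse of the true quotient bit), and the single
;;    com r_quo  at the end flips all eight bits back.  The 16- and
;;    32-bit division loops below use the same idea.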

#undef  r_divd
#undef  r_quo
#undef  r_div
#undef  r_sign


/*******************************************************
    Fractional Division 16 / 16
*******************************************************/
#define r_divdL 26     /* dividend Low */
#define r_divdH 27     /* dividend High */
#define r_quoL  24     /* quotient Low */
#define r_quoH  25     /* quotient High */
#define r_divL  22     /* divisor */
#define r_divH  23     /* divisor */
#define r_cnt   21

#if defined (L_divhq3)
DEFUN   __divhq3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    cp      r_divdL, r_divL
    cpc     r_divdH, r_divH
    breq    __divhq3_minus1  ; if equal return -1
    XCALL   __udivuhq3
    lsr     r_quoH
    ror     r_quoL
    brpl    9f
    ;; negate result if needed
    NEG2    r_quoL
9:
    ret
__divhq3_minus1:
    ldi     r_quoH, 0x80
    clr     r_quoL
    ret
ENDF __divhq3
#endif  /* defined (L_divhq3) */

#if defined (L_udivuhq3)
DEFUN   __udivuhq3
    sub     r_quoH,r_quoH   ; clear quotient and carry
    ;; FALLTHRU
ENDF __udivuhq3

DEFUN   __udivuha3_common
    clr     r_quoL          ; clear quotient
    ldi     r_cnt,16        ; init loop counter
__udivuhq3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    brcs    __udivuhq3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    brcc    __udivuhq3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivuhq3_cont
__udivuhq3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivuhq3_cont:
    rol     r_quoH          ; shift quotient
    dec     r_cnt           ; decrement loop counter
    brne    __udivuhq3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    ret
ENDF __udivuha3_common
#endif  /* defined (L_udivuhq3) */

/*******************************************************
    Fixed Division 8.8 / 8.8
*******************************************************/
#if defined (L_divha3)
DEFUN   __divha3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    XCALL   __udivuha3
    lsr     r_quoH  ; adjust to 7 fractional bits
    ror     r_quoL
    sbrs    r0, 7   ; negate result if needed
    ret
    NEG2    r_quoL
    ret
ENDF __divha3
#endif  /* defined (L_divha3) */

#if defined (L_udivuha3)
DEFUN   __udivuha3
    mov     r_quoH, r_divdL
    mov     r_divdL, r_divdH
    clr     r_divdH
    lsl     r_quoH     ; shift quotient into carry
    XJMP    __udivuha3_common ; same as fractional after rearrange
ENDF __udivuha3
#endif  /* defined (L_udivuha3) */

#undef  r_divdL
#undef  r_divdH
#undef  r_quoL
#undef  r_quoH
#undef  r_divL
#undef  r_divH
#undef  r_cnt

/*******************************************************
    Fixed Division 16.16 / 16.16
*******************************************************/

#define r_arg1L  24    /* arg1 gets passed already in place */
#define r_arg1H  25
#define r_arg1HL 26
#define r_arg1HH 27
#define r_divdL  26    /* dividend Low */
#define r_divdH  27
#define r_divdHL 30
#define r_divdHH 31    /* dividend High */
#define r_quoL   22    /* quotient Low */
#define r_quoH   23
#define r_quoHL  24
#define r_quoHH  25    /* quotient High */
#define r_divL   18    /* divisor Low */
#define r_divH   19
#define r_divHL  20
#define r_divHH  21    /* divisor High */
#define r_cnt  __zero_reg__  /* loop count (0 after the loop!) */

#if defined (L_divsa3)
DEFUN   __divsa3
    mov     r0, r_arg1HH
    eor     r0, r_divHH
    sbrs    r_divHH, 7
    rjmp    1f
    NEG4    r_divL
1:
    sbrs    r_arg1HH, 7
    rjmp    2f
    NEG4    r_arg1L
2:
    XCALL   __udivusa3
    lsr     r_quoHH ; adjust to 15 fractional bits
    ror     r_quoHL
    ror     r_quoH
    ror     r_quoL
    sbrs    r0, 7   ; negate result if needed
    ret
    ;; negate r_quoL
    XJMP    __negsi2
ENDF __divsa3
#endif  /* defined (L_divsa3) */

#if defined (L_udivusa3)
DEFUN   __udivusa3
    ldi     r_divdHL, 32    ; init loop counter
    mov     r_cnt, r_divdHL
    clr     r_divdHL
    clr     r_divdHH
    wmov    r_quoL, r_divdHL
    lsl     r_quoHL         ; shift quotient into carry
    rol     r_quoHH
__udivusa3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    rol     r_divdHL
    rol     r_divdHH
    brcs    __udivusa3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    cpc     r_divdHL,r_divHL
    cpc     r_divdHH,r_divHH
    brcc    __udivusa3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivusa3_cont
__udivusa3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    sbc     r_divdHL,r_divHL
    sbc     r_divdHH,r_divHH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivusa3_cont:
    rol     r_quoH          ; shift quotient
    rol     r_quoHL
    rol     r_quoHH
    dec     r_cnt           ; decrement loop counter
    brne    __udivusa3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    com     r_quoHL
    com     r_quoHH
    ret
ENDF __udivusa3
#endif  /* defined (L_udivusa3) */

#undef  r_arg1L
#undef  r_arg1H
#undef  r_arg1HL
#undef  r_arg1HH
#undef  r_divdL
#undef  r_divdH
#undef  r_divdHL
#undef  r_divdHH
#undef  r_quoL
#undef  r_quoH
#undef  r_quoHL
#undef  r_quoHH
#undef  r_divL
#undef  r_divH
#undef  r_divHL
#undef  r_divHH
#undef  r_cnt


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  24

#if defined (L_ssabs_1)
DEFUN __ssabs_1
    sbrs    A0, 7
    ret
    neg     A0
    sbrc    A0,7
    dec     A0
    ret
ENDF __ssabs_1
#endif /* L_ssabs_1 */

#undef A0



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  24
#define A1  A0+1

#if defined (L_ssneg_2)
DEFUN __ssneg_2
    NEG2    A0
    brvc 0f
    sbiw    A0, 1
0:  ret
ENDF __ssneg_2
#endif /* L_ssneg_2 */

#if defined (L_ssabs_2)
DEFUN __ssabs_2
    sbrs    A1, 7
    ret
    XJMP    __ssneg_2
ENDF __ssabs_2
#endif /* L_ssabs_2 */

#undef A0
#undef A1



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  22
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3

#if defined (L_ssneg_4)
DEFUN __ssneg_4
    XCALL   __negsi2
    brvc 0f
    ldi     A3, 0x7f
    ldi     A2, 0xff
    ldi     A1, 0xff
    ldi     A0, 0xff
0:  ret
ENDF __ssneg_4
#endif /* L_ssneg_4 */

#if defined (L_ssabs_4)
DEFUN __ssabs_4
    sbrs    A3, 7
    ret
    XJMP    __ssneg_4
ENDF __ssabs_4
#endif /* L_ssabs_4 */

#undef A0
#undef A1
#undef A2
#undef A3



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  18
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3
#define A4  A0+4
#define A5  A0+5
#define A6  A0+6
#define A7  A0+7

#if defined (L_clr_8)
FALIAS __usneguta2
FALIAS __usneguda2
FALIAS __usnegudq2

;; Clear Carry and all Bytes
DEFUN __clr_8
    ;; Clear Carry and set Z
    sub     A7, A7
    ;; FALLTHRU
ENDF  __clr_8
;; Propagate Carry to all Bytes, Carry unaltered
DEFUN __sbc_8
    sbc     A7, A7
    sbc     A6, A6
    wmov    A4, A6
    wmov    A2, A6
    wmov    A0, A6
    ret
ENDF __sbc_8
#endif /* L_clr_8 */
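
;; __sbc_8 broadcasts the carry into all eight bytes: after
;; sbc A7, A7  every byte becomes 0xff if C was set and 0x00 if not.
;; The saturated routines below build their limits from that mask;
;; __ssadd_8, for instance, sets carry iff B >= 0 (cpi B7, 0x80) and
;; then turns 0xff..ff / 0x00..00 into INT64_MAX / INT64_MIN with a
;; single  subi A7, 0x80.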

#if defined (L_ssneg_8)
FALIAS __ssnegta2
FALIAS __ssnegda2
FALIAS __ssnegdq2

DEFUN __ssneg_8
    XCALL   __negdi2
    brvc 0f
    ;; A[] = 0x7fffffffffffffff
    sec
    XCALL   __sbc_8
    ldi     A7, 0x7f
0:  ret
ENDF __ssneg_8
#endif /* L_ssneg_8 */

#if defined (L_ssabs_8)
FALIAS __ssabsta2
FALIAS __ssabsda2
FALIAS __ssabsdq2

DEFUN __ssabs_8
    sbrs    A7, 7
    ret
    XJMP    __ssneg_8
ENDF __ssabs_8
#endif /* L_ssabs_8 */

;; Second Argument
#define B0  10
#define B1  B0+1
#define B2  B0+2
#define B3  B0+3
#define B4  B0+4
#define B5  B0+5
#define B6  B0+6
#define B7  B0+7

#if defined (L_usadd_8)
FALIAS __usadduta3
FALIAS __usadduda3
FALIAS __usaddudq3

DEFUN __usadd_8
    XCALL   __adddi3
    brcs 0f
    ret
0:  ;; A[] = 0xffffffffffffffff
    XJMP    __sbc_8
ENDF __usadd_8
#endif /* L_usadd_8 */

#if defined (L_ussub_8)
FALIAS __ussubuta3
FALIAS __ussubuda3
FALIAS __ussubudq3

DEFUN __ussub_8
    XCALL   __subdi3
    brcs 0f
    ret
0:  ;; A[] = 0
    XJMP    __clr_8
ENDF __ussub_8
#endif /* L_ussub_8 */

#if defined (L_ssadd_8)
FALIAS __ssaddta3
FALIAS __ssaddda3
FALIAS __ssadddq3

DEFUN __ssadd_8
    XCALL   __adddi3
    brvc 0f
    ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
    cpi     B7, 0x80
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __ssadd_8
#endif /* L_ssadd_8 */

#if defined (L_sssub_8)
FALIAS __sssubta3
FALIAS __sssubda3
FALIAS __sssubdq3

DEFUN __sssub_8
    XCALL   __subdi3
    brvc 0f
    ;; A = (B < 0) ? INT64_MAX : INT64_MIN
    ldi     A7, 0x7f
    cp      A7, B7
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __sssub_8
#endif /* L_sssub_8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding Helpers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_mask1

#define AA 24
#define CC 25

;; R25 = 1 << (R24 & 7)
;; CC  = 1 << (AA  & 7)
;; Clobbers: None
DEFUN __mask1
    ;; CC = 2 ^ AA.1
    ldi     CC, 1 << 2
    sbrs    AA, 1
    ldi     CC, 1 << 0
    ;; CC *= 2 ^ AA.0
    sbrc    AA, 0
    lsl     CC
    ;; CC *= 2 ^ AA.2
    sbrc    AA, 2
    swap    CC
    ret
ENDF __mask1

#undef AA
#undef CC
#endif /* L_mask1 */
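
;; __mask1 composes 1 << (AA & 7) from powers of two without a loop:
;; start with 4 or 1 depending on bit 1, double if bit 0 is set, and
;; multiply by 16 (one swap) if bit 2 is set.
;; Worked example, AA = 5 = 0b101:
;;     bit 1 clear  ->  CC = 1
;;     bit 0 set    ->  lsl:   CC = 2
;;     bit 2 set    ->  swap:  CC = 0x20 = 1 << 5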

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The rounding point. Any bits smaller than
;; 2^{-RP} will be cleared.
#define RP R24

#define A0 22
#define A1 A0 + 1

#define C0 24
#define C1 C0 + 1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_roundqq3

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
DEFUN  __roundqq3
    mov     __tmp_reg__, C1
    subi    RP, __QQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brvc 0f
    ldi     C0, 0x7f
    rjmp 9f
0:  ;; Mask out bits beyond RP
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF  __roundqq3
#endif /* L_roundqq3 */

#ifdef L_rounduqq3

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
DEFUN  __rounduqq3
    mov     __tmp_reg__, C1
    subi    RP, __UQQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brcc 0f
    ldi     C0, 0xff
    rjmp 9f
0:  ;; Mask out bits beyond RP
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF  __rounduqq3
#endif /* L_rounduqq3 */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_addmask_2

;; [ R25:R24 =  1 << (R24 & 15)
;;   R23:R22 += 1 << (R24 & 15) ]
;; SREG is set according to the addition
DEFUN __addmask_2
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 3
    sbc     C0, C0
    ;; Swap C0 and C1 if RP.3 was not set
    and     C0, C1
    eor     C1, C0
    ;; Finally, add the power-of-two:  A[] += C[]
    add     A0, C0
    adc     A1, C1
    ret
ENDF  __addmask_2
#endif /* L_addmask_2 */
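
;; The byte select in __addmask_2 is branch-free:  cpi RP, 1 << 3
;; sets carry iff RP < 8, and  sbc C0, C0  turns that carry into an
;; all-ones / all-zero mask; the and/eor pair then moves the one-bit
;; from the high into the low byte exactly when RP < 8, giving
;; C[] = 1 << (RP & 15).  For example
;;     RP =  3:  mask = 0xff,  C1:C0 = 0x00:0x08
;;     RP = 11:  mask = 0x00,  C1:C0 = 0x08:0x00
;; __addmask_4 below extends the same trick to two swap levels.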

#ifdef L_round_s2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN  __roundhq3
    subi    RP, __HQ_FBIT__ - __HA_FBIT__
ENDF   __roundhq3
DEFUN  __roundha3
    subi    RP, __HA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_s2_const
ENDF  __roundha3

#endif /* L_round_s2 */

#ifdef L_round_u2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN  __rounduhq3
    subi    RP, __UHQ_FBIT__ - __UHA_FBIT__
ENDF   __rounduhq3
DEFUN  __rounduha3
    subi    RP, __UHA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_u2_const
ENDF  __rounduha3

#endif /* L_round_u2 */


#ifdef L_round_2_const

;; Helpers for 2 byte wide rounding

DEFUN  __round_s2_const
    brvc 2f
    ldi     C1, 0x7f
    rjmp 1f
    ;; FALLTHRU (Barrier)
ENDF  __round_s2_const

DEFUN __round_u2_const
    brcc 2f
    ldi     C1, 0xff
1:
    ldi     C0, 0xff
    rjmp 9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    ;;
    NEG2    C0
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
9:  ret
ENDF  __round_u2_const

#endif /* L_round_2_const */
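
;; In bit terms, with k = FBIT - RP low bits to discard, the rounding
;; above computes
;;     round (a)  =  (a + (1 << (k-1)))  &  -(1 << k)
;; i.e. add half a target LSB, saturate if that addition overflowed,
;; then clear the k low bits: the lsl/rol doubles the 2^{-RP-1}
;; constant and the negation turns it into the mask -(1 << k).  The
;; 4- and 8-byte helpers below follow the same pattern.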

#undef A0
#undef A1
#undef C0
#undef C1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#define A0 18
#define A1 A0 + 1
#define A2 A0 + 2
#define A3 A0 + 3

#define C0 22
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3

#ifdef L_addmask_4

;; [ R25:R22 =  1 << (R24 & 31)
;;   R21:R18 += 1 << (R24 & 31) ]
;; SREG is set according to the addition
DEFUN __addmask_4
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 4
    sbc     C0, C0
    sbc     C1, C1
    ;; Swap C2 with C3 if RP.3 is not set
    cpi     RP, 1 << 3
    sbc     C2, C2
    and     C2, C3
    eor     C3, C2
    ;; Swap C3:C2 with C1:C0 if RP.4 is not set
    and     C0, C2  $  eor     C2, C0
    and     C1, C3  $  eor     C3, C1
    ;; Finally, add the power-of-two:  A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    ret
ENDF  __addmask_4
#endif /* L_addmask_4 */

#ifdef L_round_s4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN  __roundsq3
    subi    RP, __SQ_FBIT__ - __SA_FBIT__
ENDF   __roundsq3
DEFUN  __roundsa3
    subi    RP, __SA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_s4_const
ENDF  __roundsa3

#endif /* L_round_s4 */

#ifdef L_round_u4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN  __roundusq3
    subi    RP, __USQ_FBIT__ - __USA_FBIT__
ENDF   __roundusq3
DEFUN  __roundusa3
    subi    RP, __USA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_u4_const
ENDF  __roundusa3

#endif /* L_round_u4 */


#ifdef L_round_4_const

;; Helpers for 4 byte wide rounding

DEFUN  __round_s4_const
    brvc 2f
    ldi     C3, 0x7f
    rjmp 1f
    ;; FALLTHRU (Barrier)
ENDF  __round_s4_const

DEFUN __round_u4_const
    brcc 2f
    ldi     C3, 0xff
1:
    ldi     C2, 0xff
    ldi     C1, 0xff
    ldi     C0, 0xff
    rjmp 9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    rol     C2
    rol     C3
    XCALL   __negsi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
9:  ret
ENDF  __round_u4_const

#endif /* L_round_4_const */

#undef A0
#undef A1
#undef A2
#undef A3
#undef C0
#undef C1
#undef C2
#undef C3

#undef RP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#define RP     16
#define FBITm1 31

#define C0 18
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3
#define C4 C0 + 4
#define C5 C0 + 5
#define C6 C0 + 6
#define C7 C0 + 7

#define A0 16
#define A1 17
#define A2 26
#define A3 27
#define A4 28
#define A5 29
#define A6 30
#define A7 31


#ifdef L_rounddq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __rounddq3
    ldi     FBITm1, __DQ_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF  __rounddq3
#endif /* L_rounddq3 */

#ifdef L_roundudq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __roundudq3
    ldi     FBITm1, __UDQ_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF  __roundudq3
#endif /* L_roundudq3 */

#ifdef L_roundda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __roundda3
    ldi     FBITm1, __DA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF  __roundda3
#endif /* L_roundda3 */

#ifdef L_rounduda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __rounduda3
    ldi     FBITm1, __UDA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF  __rounduda3
#endif /* L_rounduda3 */

#ifdef L_roundta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __roundta3
    ldi     FBITm1, __TA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF  __roundta3
#endif /* L_roundta3 */

#ifdef L_rounduta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __rounduta3
    ldi     FBITm1, __UTA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF  __rounduta3
#endif /* L_rounduta3 */


#ifdef L_round_x8
DEFUN __round_x8
    push r16
    push r17
    push r28
    push r29
    ;; Compute log2 of addend from rounding point
    sub     RP, FBITm1
    neg     RP
    ;; Move input to work register A[]
    push    C0
    mov     A1, C1
    wmov    A2, C2
    wmov    A4, C4
    wmov    A6, C6
    ;; C[] = 1 << (FBIT-1 - RP)
    XCALL   __clr_8
    inc     C0
    XCALL   __ashldi3
    pop     A0
    ;; A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    adc     A4, C4
    adc     A5, C5
    adc     A6, C6
    adc     A7, C7
    brts    1f
    ;; Signed
    brvc    3f
    ;; Signed overflow: A[] = 0x7f...
    brvs    2f
1:  ;; Unsigned
    brcc    3f
    ;; Unsigned overflow: A[] = 0xff...
2:  ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C0, C6
    wmov    C2, C6
    wmov    C4, C6
    bld     C7, 7
    rjmp 9f
3:
    ;;  C[] = -2 * C[]
    push    A0
    ldi     r16, 1
    XCALL   __ashldi3
    pop     A0
    XCALL   __negdi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
    and     C4, A4
    and     C5, A5
    and     C6, A6
    and     C7, A7
9:  ;; Epilogue
    pop r29
    pop r28
    pop r17
    pop r16
    ret
ENDF  __round_x8

#endif /* L_round_x8 */
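
;; __round_x8 is the generic 8-byte instance of the same algorithm:
;; the addend 1 << (FBIT-1 - RP) is built at run time with __ashldi3
;; (FBITm1 lives in a register because the DQ/DA/TA widths share this
;; code), and the T flag selects the saturation style; on overflow all
;; bytes are set to 0xff and the final  bld C7, 7  clears the top bit
;; for the signed (T = 0) variants, yielding 0x7f...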

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7

#undef RP
#undef FBITm1


;; Supply implementations / symbols for the bit-banging functions
;; __builtin_avr_bitsfx and __builtin_avr_fxbits
#ifdef L_ret
DEFUN __ret
    ret
ENDF  __ret
#endif /* L_ret */
1916