dnl  HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
dnl  the result from a second limb vector.

dnl  Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
C res_ptr	r26
C s1_ptr	r25
C size		r24
C s2_limb	r23

C This runs at 12 cycles/limb on a PA7000.  With the used instructions, it can
C not become faster due to data cache contention after a store.  On the PA7100
C it runs at 11 cycles/limb.

C There are some ideas described in mul_1.asm that apply to this code too.

C It seems possible to make this run as fast as mpn_addmul_1, if we use
C	sub,>>=	%r29,%r19,%r22
C	addi	1,%r28,%r28
C but that requires reworking the hairy software pipeline...

C OVERVIEW
C For each of the `size' limbs at s1_ptr, the 64-bit product with s2_limb is
C formed with xmpyu in the floating-point registers, and its low part (plus
C whatever was carried over from the previous limb) is subtracted from the
C corresponding limb at res_ptr, which is updated in place.  The high limb
C left over at the end is in r28 when the routine returns (presumably the
C return value under the PA-RISC calling convention -- TODO confirm).
C
C Products are moved from the floating-point to the general registers through
C an 8-byte scratch slot at -16(%r30): the word at -16 is the most significant
C half and the word at -12 is the least significant half.  s2_limb travels the
C other way through the same slot.
C
C REGISTER ROLES
C	fr4	s2_limb
C	fr5,fr7	limbs fetched from s1_ptr
C	fr6,fr8	64-bit products
C	r29	limb loaded from res_ptr
C	r22	difference to be stored back to res_ptr
C	r19	value to subtract from the next res limb
C	r1	low word of the most recent product
C	r28	high word of a product, plus accumulated carry
C
C The instruction order below is a software pipeline; the carry flag and the
C scratch slot are both live across several instructions, so do not reorder.
C
C NOTE(review): size == 0 is not handled; the first limb is loaded
C unconditionally and the counter is only tested for zero after decrementing.
C Presumably callers guarantee size >= 1.

ASM_START()
PROLOGUE(mpn_submul_1)
C	.callinfo	frame=64,no_calls

	ldo		64(%r30),%r30		C advance r30 by 64 to get scratch space
	fldws,ma	4(%r25),%fr5		C fr5 = first s1 limb, s1_ptr += 4
	stw		%r23,-16(%r30)		C move s2_limb ...
	addib,=		-1,%r24,L(just_one_limb) C size -= 1; branch if that was the only limb
	 fldws		-16(%r30),%fr4		C ... into fr4
	add		%r0,%r0,%r0		C clear carry
	xmpyu		%fr4,%fr5,%fr6		C fr6 = first product
	fldws,ma	4(%r25),%fr7		C fr7 = second s1 limb, s1_ptr += 4
	fstds		%fr6,-16(%r30)		C spill first product to the scratch slot
	xmpyu		%fr4,%fr7,%fr8		C fr8 = second product
	ldw		-12(%r30),%r19		C least significant limb in product
	ldw		-16(%r30),%r28		C most significant limb of first product

	fstds		%fr8,-16(%r30)		C spill second product to the scratch slot
	addib,=		-1,%r24,L(end)		C size -= 1; branch if exactly two limbs
	 ldw		-12(%r30),%r1		C (delay slot) low word of second product

C Main loop
C On entry to each iteration: r19 is the value to subtract from the limb at
C res_ptr, r28 is the high word of the previous product, r1 is the low word of
C the newest product, and the high word of the newest product is still in the
C scratch slot at -16(%r30).
LDEF(loop)
	ldws		0(%r26),%r29		C r29 = current res limb
	fldws,ma	4(%r25),%fr5		C fetch next s1 limb, s1_ptr += 4
	sub		%r29,%r19,%r22		C r22 = res limb - r19
	add		%r22,%r19,%r0		C result discarded; carries out exactly when the sub wrapped
	stws,ma		%r22,4(%r26)		C store difference, res_ptr += 4
	addc		%r28,%r1,%r19		C next r19 = previous high + newest low + borrow
	xmpyu		%fr4,%fr5,%fr6		C multiply the limb just fetched
	ldw		-16(%r30),%r28		C high word of the product still in the slot
	fstds		%fr6,-16(%r30)		C overwrite the slot with the new product
	addc		%r0,%r28,%r28		C fold the carry of the addc above into the high word
	addib,<>	-1,%r24,L(loop)		C size -= 1; loop while limbs remain
	 ldw		-12(%r30),%r1		C (delay slot) low word of the new product

C Drain the pipeline: two limbs of res_ptr remain to be updated and the last
C product is already in the scratch slot, so no further multiplies are needed.
LDEF(end)
	ldw		0(%r26),%r29		C r29 = second-to-last res limb
	sub		%r29,%r19,%r22		C r22 = res limb - r19
	add		%r22,%r19,%r0		C result discarded; carries out exactly when the sub wrapped
	stws,ma		%r22,4(%r26)		C store difference, res_ptr += 4
	addc		%r28,%r1,%r19		C r19 = previous high + last low + borrow
	ldw		-16(%r30),%r28		C high word of the last product
	ldws		0(%r26),%r29		C r29 = last res limb
	addc		%r0,%r28,%r28		C fold the carry of the addc above into the high word
	sub		%r29,%r19,%r22		C r22 = last res limb - r19
	add		%r22,%r19,%r0		C result discarded; carries out exactly when the sub wrapped
	stws,ma		%r22,4(%r26)		C store last difference
	addc		%r0,%r28,%r28		C r28 = high word + final borrow
	bv		0(%r2)			C branch to the address held in r2
	 ldo		-64(%r30),%r30		C (delay slot) undo the 64-byte adjustment of r30

C size == 1: a single product, no pipelining.
LDEF(just_one_limb)
	xmpyu		%fr4,%fr5,%fr6		C fr6 = the only product
	ldw		0(%r26),%r29		C r29 = the only res limb
	fstds		%fr6,-16(%r30)		C spill product to the scratch slot
	ldw		-12(%r30),%r1		C low word of the product
	ldw		-16(%r30),%r28		C high word of the product
	sub		%r29,%r1,%r22		C r22 = res limb - low word
	add		%r22,%r1,%r0		C result discarded; carries out exactly when the sub wrapped
	stw		%r22,0(%r26)		C store the difference
	addc		%r0,%r28,%r28		C r28 = high word + borrow
	bv		0(%r2)			C branch to the address held in r2
	 ldo		-64(%r30),%r30		C (delay slot) undo the 64-byte adjustment of r30
EPILOGUE()