dnl  HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the
dnl  result in a second limb vector.

dnl  Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
C res_ptr	r26
C s1_ptr	r25
C size		r24
C s2_limb	r23

C This runs at 9 cycles/limb on a PA7000.  With the instructions used, it
C cannot become faster due to data cache contention after a store.  On the
C PA7100 it runs at 7 cycles/limb.

C We could use fldds to read two limbs at a time from the S1 array, and that
C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
C PA7100, respectively.  We don't do that since it does not seem worth the
C (alignment) troubles...

C At least the PA7100 is rumored to be able to deal with cache-misses without
C stalling instruction issue.  If this is true, and the cache is actually also
C lockup-free, we should use a deeper software pipeline, and load from S1 very
C early!  (The loads and stores to -12(sp) will surely be in the cache.)
ASM_START()

C mpn_mul_1: multiply the limb vector at s1_ptr (size limbs) by s2_limb and
C store the size low-order result limbs at res_ptr.  The most significant
C limb of the product (the carry-out) is left in r28 at exit.
C
C Registers on entry:
C   r26  res_ptr   destination, advanced by 4 per stored limb
C   r25  s1_ptr    source, advanced by 4 per loaded limb
C   r24  size      limb count, used as the down-counter
C   r23  s2_limb   multiplier
C Exit is through "bv 0(%r2)".
C
C Working state: r1, r19, r28, fr4-fr8, plus the 8 bytes at -16(%r30) of a
C 64-byte frame that is allocated on entry and released on both exit paths.
C That stack slot is how values cross between the general and floating-point
C register files: xmpyu produces each 64-bit product in an FP register, fstds
C writes it to the slot, and ldw reads the high word from -16 and the low
C word from -12.
C
C size must be at least 1.  Both count tests only detect the counter reaching
C exactly zero after a decrement, so size = 0 is not caught.
C
C The instruction written directly after each addib/bv (indented by one extra
C space) sits in the branch delay slot and is executed as part of the branch
C sequence, whether or not the branch is taken.
C
C The addc chain passes its carry from one limb to the next across the
C intervening loads, stores, xmpyu and addib, so the code relies on none of
C those altering the carry bit.

PROLOGUE(mpn_mul_1)
C	.callinfo	frame=64,no_calls

	ldo		64(%r30),%r30		C allocate the 64-byte frame
	fldws,ma	4(%r25),%fr5		C fr5 = s1[0], s1_ptr += 4
	stw		%r23,-16(%r30)		C move s2_limb ...
	addib,=		-1,%r24,L(just_one_limb) C size was 1: take the short path
	 fldws		-16(%r30),%fr4		C ... into fr4 (delay slot; needed on both paths)
	add		%r0,%r0,%r0		C clear carry
	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * s1[0]
	fldws,ma	4(%r25),%fr7		C fr7 = s1[1], s1_ptr += 4
	fstds		%fr6,-16(%r30)		C spill first product to the stack slot
	xmpyu		%fr4,%fr7,%fr8		C fr8 = s2_limb * s1[1]
	ldw		-12(%r30),%r19		C least significant limb in product
	ldw		-16(%r30),%r28		C high word of the first product

	fstds		%fr8,-16(%r30)		C spill second product to the stack slot
	addib,=		-1,%r24,L(end)		C size was 2: nothing left for the loop
	 ldw		-12(%r30),%r1		C low word of second product (delay slot)

C Main loop
C At the top of each pass: r19 is the finished limb waiting to be stored, r28
C is the high word of the product before the newest one, r1 is the low word of
C the newest product, and the stack slot still holds the newest product.
LDEF(loop)
	fldws,ma	4(%r25),%fr5		C fetch next source limb, s1_ptr += 4
	stws,ma		%r19,4(%r26)		C store finished limb, res_ptr += 4
	addc		%r28,%r1,%r19		C next limb = older high + newer low + carry
	xmpyu		%fr4,%fr5,%fr6		C multiply the limb just fetched
	ldw		-16(%r30),%r28		C high word of the product still in the slot
	fstds		%fr6,-16(%r30)		C replace the slot with the new product
	addib,<>	-1,%r24,L(loop)		C loop while limbs remain
	 ldw		-12(%r30),%r1		C low word of the new product (delay slot)

C Drain the pipeline: one finished limb is pending in r19 and the last product
C is still in the stack slot.
LDEF(end)
	stws,ma		%r19,4(%r26)		C store the pending limb
	addc		%r28,%r1,%r19		C last result limb
	ldw		-16(%r30),%r28		C high word of the last product
	stws,ma		%r19,4(%r26)		C store the last result limb
	addc		%r0,%r28,%r28		C fold the final carry into the carry-out limb
	bv		0(%r2)			C exit
	 ldo		-64(%r30),%r30		C release the frame (delay slot)

C size == 1: a single product, no carry propagation needed.
LDEF(just_one_limb)
	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * s1[0]
	fstds		%fr6,-16(%r30)		C spill so the high word can reach r28
	ldw		-16(%r30),%r28		C carry-out limb = high word of the product
	ldo		-64(%r30),%r30		C release the frame
	bv		0(%r2)			C exit
	 fstws		%fr6R,0(%r26)		C store low word (right half of fr6) at res_ptr (delay slot)
EPILOGUE()