dnl HP-PA 2.0 64-bit mpn_sqr_diagonal.

dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
dnl PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
dnl for optimization.
include(`../config.m4')

C mpn_sqr_diagonal (rp, up, n)
C
C For each of the n 64-bit limbs read from up, store the 128-bit square of
C that limb at rp as two 64-bit words: low word at offset 0, high word at
C offset 8.  rp advances by 16 bytes and up by 8 bytes per limb.
C
C n is assumed to be at least 1.  The count is decremented before it is
C first tested, so a zero count is not detected.
C
C Method.  xmpyu forms a 32x32->64 bit product in the floating-point
C registers.  Writing a limb as u = h*2^32 + l, with h the left and l the
C right half of the fp register, three products are formed per limb:
C
C	l*l	stored straight to 0(rp)
C	h*h	stored straight to 8(rp)
C	h*l	spilled to a stack slot, reloaded into p32
C
C The integer unit then reloads the two stored words into p00 and p64 and
C adds 2*p32*2^32 across the pair, propagating the carry with add,dc.
C
C Two equivalent forms of that fix-up are used:
C
C	loop, end2:	t0 = p32 << 33, t1 = p32 >> 31, added once
C	exit, end1:	t0 = p32 << 32, t1 = p32 >> 32, added twice
C
C Scheduling.  The loop is unrolled twice.  The two halves alternate between
C %fr8 and %fr4 as the input limb and between the spill slots at -120(%r30)
C and -128(%r30), so the integer fix-up of one limb is interleaved with the
C multiplies of the following limb.  The fix-up therefore always addresses
C the previous output pair at -16(rp) and -8(rp).
C
C Every addib below is followed by its delay-slot instruction, which is
C written on the next line and is not nullified.

C INPUT PARAMETERS
define(`rp',`%r26')
define(`up',`%r25')
define(`n',`%r24')

C INTEGER WORK REGISTERS
define(`p00',`%r28')
define(`p32',`%r29')
define(`p64',`%r31')
define(`t0',`%r19')
define(`t1',`%r20')

ifdef(`HAVE_ABI_2_0w',
`	.level	2.0w
',`	.level	2.0
')
PROLOGUE(mpn_sqr_diagonal)
	ldo		128(%r30),%r30		C reserve 128 bytes, spill slots at -128 and -120

	fldds,ma	8(up),%fr8		C first limb, then up += 8
	addib,=		-1,n,L(end1)		C only one limb: finish in end1
	nop					C delay slot
	fldds,ma	8(up),%fr4		C second limb
	xmpyu		%fr8l,%fr8r,%fr10	C h*l of first limb
	fstd		%fr10,-120(%r30)
	xmpyu		%fr8r,%fr8r,%fr9	C l*l
	fstd		%fr9,0(rp)
	xmpyu		%fr8l,%fr8l,%fr11	C h*h
	fstd		%fr11,8(rp)
	addib,=		-1,n,L(end2)		C exactly two limbs: drain in end2
	ldo		16(rp),rp		C delay slot: step rp to the next pair

LDEF(loop)
C First half: multiply the limb in %fr4, fix up the pair of the previous
C limb whose h*l is in the -120 slot.
	fldds,ma	8(up),%fr8		C load next up limb
	xmpyu		%fr4l,%fr4r,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr4r,%fr4r,%fr5	C multiply in fp regs
	fstd		%fr5,0(rp)
	xmpyu		%fr4l,%fr4l,%fr7
	fstd		%fr7,8(rp)
	ldd		-120(%r30),p32
	ldd		-16(rp),p00		C accumulate in int regs
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0		C t0 = p32 << 33
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1		C t1 = p32 >> 31
	add,dc		t1,p64,p64		C uses the carry from the add above
	std		p64,-8(rp)
	addib,=		-1,n,L(exit)		C %fr8 holds the last limb: drain in exit
	ldo		16(rp),rp		C delay slot

C Second half: same work with the roles of %fr8/%fr4 and of the two spill
C slots exchanged.
	fldds,ma	8(up),%fr4
	xmpyu		%fr8l,%fr8r,%fr10
	fstd		%fr10,-120(%r30)
	xmpyu		%fr8r,%fr8r,%fr9
	fstd		%fr9,0(rp)
	xmpyu		%fr8l,%fr8l,%fr11
	fstd		%fr11,8(rp)
	ldd		-128(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1
	add,dc		t1,p64,p64
	std		p64,-8(rp)
	addib,<>	-1,n,L(loop)		C more limbs: back to the first half
	ldo		16(rp),rp		C delay slot

LDEF(end2)
C Drain with the last limb in %fr4.  Its products are formed first, then the
C pending fix-up from the -120 slot and finally its own from the -128 slot.
	xmpyu		%fr4l,%fr4r,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr4r,%fr4r,%fr5
	fstd		%fr5,0(rp)
	xmpyu		%fr4l,%fr4l,%fr7
	fstd		%fr7,8(rp)
	ldd		-120(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1
	add,dc		t1,p64,p64
	std		p64,-8(rp)
	ldo		16(rp),rp		C step past the final pair
	ldd		-128(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1
	add,dc		t1,p64,p64
	std		p64,-8(rp)
	bve		(%r2)			C return
	ldo		-128(%r30),%r30		C delay slot: release the 128 bytes

LDEF(exit)
C Drain with the last limb in %fr8.  The pending fix-up comes from the -128
C slot, the final one from the -120 slot.  The cross product is shifted by
C 32 bits here and added twice.
	xmpyu		%fr8l,%fr8r,%fr10
	fstd		%fr10,-120(%r30)
	xmpyu		%fr8r,%fr8r,%fr9
	fstd		%fr9,0(rp)
	xmpyu		%fr8l,%fr8l,%fr11
	fstd		%fr11,8(rp)
	ldd		-128(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,31,32,t0		C t0 = p32 << 32
	add		t0,p00,p00
	extrd,u		p32,31,32,t1		C t1 = p32 >> 32
	add,dc		t1,p64,p64
	add		t0,p00,p00		C second addition of the cross product
	add,dc		t1,p64,p64
	std		p00,-16(rp)
	std		p64,-8(rp)
	ldo		16(rp),rp		C step past the final pair
	ldd		-120(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,31,32,t0
	add		t0,p00,p00
	extrd,u		p32,31,32,t1
	add,dc		t1,p64,p64
	add		t0,p00,p00
	add,dc		t1,p64,p64
	std		p00,-16(rp)
	std		p64,-8(rp)
	bve		(%r2)			C return
	ldo		-128(%r30),%r30		C delay slot: release the 128 bytes

LDEF(end1)
C Single-limb case.  rp has not been advanced yet, so the products go to
C 0(rp) and 8(rp) and rp is stepped before the fix-up.
	xmpyu		%fr8l,%fr8r,%fr10
	fstd		%fr10,-128(%r30)
	xmpyu		%fr8r,%fr8r,%fr9
	fstd		%fr9,0(rp)
	xmpyu		%fr8l,%fr8l,%fr11
	fstd		%fr11,8(rp)
	ldo		16(rp),rp
	ldd		-128(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,31,32,t0		C t0 = p32 << 32
	add		t0,p00,p00
	extrd,u		p32,31,32,t1		C t1 = p32 >> 32
	add,dc		t1,p64,p64
	add		t0,p00,p00		C second addition of the cross product
	add,dc		t1,p64,p64
	std		p00,-16(rp)
	std		p64,-8(rp)
	bve		(%r2)			C return
	ldo		-128(%r30),%r30		C delay slot: release the 128 bytes
EPILOGUE(mpn_sqr_diagonal)