dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl  store sum in a third limb vector.

dnl  Copyright 2001, 2002, 2003, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C UltraSPARC 1&2:	4
C UltraSPARC 3:		4.5

C Compute carry-out from the most significant bits of u, v, and r, where
C r = u + v + carry_in, using logic operations.

C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4-insn
C recurrency, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...
33 34C INPUT PARAMETERS 35define(`rp',`%i0') 36define(`up',`%i1') 37define(`vp',`%i2') 38define(`n',`%i3') 39 40define(`u0',`%l0') 41define(`u1',`%l2') 42define(`u2',`%l4') 43define(`u3',`%l6') 44define(`v0',`%l1') 45define(`v1',`%l3') 46define(`v2',`%l5') 47define(`v3',`%l7') 48 49define(`cy',`%i4') 50 51define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe 52define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe 53 54ASM_START() 55 REGISTER(%g2,#scratch) 56 REGISTER(%g3,#scratch) 57PROLOGUE(mpn_add_nc) 58 save %sp,-160,%sp 59 60 fitod %f0,%f0 C make sure f0 contains small, quiet number 61 subcc n,4,%g0 62 bl,pn %xcc,.Loop0 63 nop 64 b,a L(com) 65EPILOGUE() 66 67PROLOGUE(mpn_add_n) 68 save %sp,-160,%sp 69 70 fitod %f0,%f0 C make sure f0 contains small, quiet number 71 subcc n,4,%g0 72 bl,pn %xcc,.Loop0 73 mov 0,cy 74L(com): 75 ldx [up+0],u0 76 ldx [vp+0],v0 77 add up,32,up 78 ldx [up-24],u1 79 ldx [vp+8],v1 80 add vp,32,vp 81 ldx [up-16],u2 82 ldx [vp-16],v2 83 ldx [up-8],u3 84 ldx [vp-8],v3 85 subcc n,8,n 86 add u0,v0,%g1 C main add 87 add %g1,cy,%g4 C carry add 88 or u0,v0,%g2 89 bl,pn %xcc,.Lend4567 90 fanop 91 b,a .Loop 92 93 .align 16 94C START MAIN LOOP 95.Loop: andn %g2,%g4,%g2 96 and u0,v0,%g3 97 ldx [up+0],u0 98 fanop 99C -- 100 or %g3,%g2,%g2 101 ldx [vp+0],v0 102 add up,32,up 103 fanop 104C -- 105 srlx %g2,63,cy 106 add u1,v1,%g1 107 stx %g4,[rp+0] 108 fanop 109C -- 110 add %g1,cy,%g4 111 or u1,v1,%g2 112 fmnop 113 fanop 114C -- 115 andn %g2,%g4,%g2 116 and u1,v1,%g3 117 ldx [up-24],u1 118 fanop 119C -- 120 or %g3,%g2,%g2 121 ldx [vp+8],v1 122 add vp,32,vp 123 fanop 124C -- 125 srlx %g2,63,cy 126 add u2,v2,%g1 127 stx %g4,[rp+8] 128 fanop 129C -- 130 add %g1,cy,%g4 131 or u2,v2,%g2 132 fmnop 133 fanop 134C -- 135 andn %g2,%g4,%g2 136 and u2,v2,%g3 137 ldx [up-16],u2 138 fanop 139C -- 140 or %g3,%g2,%g2 141 ldx [vp-16],v2 142 add rp,32,rp 143 fanop 144C -- 145 srlx %g2,63,cy 146 add u3,v3,%g1 147 stx %g4,[rp-16] 148 
fanop 149C -- 150 add %g1,cy,%g4 151 or u3,v3,%g2 152 fmnop 153 fanop 154C -- 155 andn %g2,%g4,%g2 156 and u3,v3,%g3 157 ldx [up-8],u3 158 fanop 159C -- 160 or %g3,%g2,%g2 161 subcc n,4,n 162 ldx [vp-8],v3 163 fanop 164C -- 165 srlx %g2,63,cy 166 add u0,v0,%g1 167 stx %g4,[rp-8] 168 fanop 169C -- 170 add %g1,cy,%g4 171 or u0,v0,%g2 172 bge,pt %xcc,.Loop 173 fanop 174C END MAIN LOOP 175.Lend4567: 176 andn %g2,%g4,%g2 177 and u0,v0,%g3 178 or %g3,%g2,%g2 179 srlx %g2,63,cy 180 add u1,v1,%g1 181 stx %g4,[rp+0] 182 add %g1,cy,%g4 183 or u1,v1,%g2 184 andn %g2,%g4,%g2 185 and u1,v1,%g3 186 or %g3,%g2,%g2 187 srlx %g2,63,cy 188 add u2,v2,%g1 189 stx %g4,[rp+8] 190 add %g1,cy,%g4 191 or u2,v2,%g2 192 andn %g2,%g4,%g2 193 and u2,v2,%g3 194 or %g3,%g2,%g2 195 add rp,32,rp 196 srlx %g2,63,cy 197 add u3,v3,%g1 198 stx %g4,[rp-16] 199 add %g1,cy,%g4 200 or u3,v3,%g2 201 andn %g2,%g4,%g2 202 and u3,v3,%g3 203 or %g3,%g2,%g2 204 srlx %g2,63,cy 205 stx %g4,[rp-8] 206 207 addcc n,4,n 208 bz,pn %xcc,.Lret 209 fanop 210 211.Loop0: ldx [up],u0 212 add up,8,up 213 ldx [vp],v0 214 add vp,8,vp 215 add rp,8,rp 216 subcc n,1,n 217 add u0,v0,%g1 218 or u0,v0,%g2 219 add %g1,cy,%g4 220 and u0,v0,%g3 221 andn %g2,%g4,%g2 222 stx %g4,[rp-8] 223 or %g3,%g2,%g2 224 bnz,pt %xcc,.Loop0 225 srlx %g2,63,cy 226 227.Lret: mov cy,%i0 228 ret 229 restore 230EPILOGUE() 231