dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl the result to a second limb vector.

dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C                  cycles/limb
C UltraSPARC 1&2:      6.5
C UltraSPARC 3:         ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling, which could improve speed for
C      the L2 cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.
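
C For reference, the per-limb computation described in the algorithm comment
C can be modelled by the rough C sketch below (ref_addmul_1 is an illustrative
C name, not part of this file; it assumes a 32-bit mp_limb_t and <stdint.h>).
C Each 32x16-bit partial product is at most 48 bits, so the fmuld results are
C exact doubles, which is what allows the FP unit to do the multiplies:
C
C   mp_limb_t
C   ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v)
C   {
C     uint64_t v0 = v & 0xffff, v16 = v >> 16, cy = 0;
C     for (mp_size_t i = 0; i < n; i++)
C       {
C         uint64_t p0  = (uint64_t) up[i] * v0;   /* fmuld, <= 48 bits */
C         uint64_t p16 = (uint64_t) up[i] * v16;  /* fmuld, <= 48 bits */
C         uint64_t p = p0 + (p16 << 16) + rp[i] + cy;
C         rp[i] = (mp_limb_t) p;                  /* low 32 bits back to rp */
C         cy = p >> 32;                           /* high part is next carry */
C       }
C     return cy;                                  /* final carry limb */
C   }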

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_addmul_1)
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6
	fxtod	%f8, %f8
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2

	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C --  0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C --  1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C --  2
	nop
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C --  3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C --  4
	nop
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C --  5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C --  6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0
	retl
	sub	%sp, -FSIZE, %sp
EPILOGUE(mpn_addmul_1)