1dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and 2dnl subtract the result from a second limb vector. 3 4dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of the GNU Lesser General Public License as published 10dnl by the Free Software Foundation; either version 3 of the License, or (at 11dnl your option) any later version. 12 13dnl The GNU MP Library is distributed in the hope that it will be useful, but 14dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16dnl License for more details. 17 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23C Algorithm: We use two floating-point multiplies per limb product, with the 24C invariant v operand split into two 16-bit pieces, and the u operand split 25C into 32-bit pieces. We convert the two 48-bit products and transfer them to 26C the integer unit. 27 28C cycles/limb 29C UltraSPARC 1&2: 6.5 30C UltraSPARC 3: ? 31 32C Possible optimizations: 33C 1. Combine 32-bit memory operations into 64-bit operations. Since we're 34C memory bandwidth limited, this could save 1.5 cycles/limb. 35C 2. Unroll the inner loop. Since we already use alternate temporary areas, 36C it is very straightforward to unroll, using an exit branch midways. 37C Unrolling would allow deeper scheduling which could improve speed for L2 38C cache case. 39C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es 40C aren't sufficiently apart-scheduled with just two temp areas. 41C 4. Specialize for particular v values. If its upper 16 bits are zero, we 42C could save many operations. 
C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3
C
C The register names i0-i3 are kept from the original table.  No save is
C executed anywhere in this function, so the code itself addresses the four
C arguments through %o0-%o3.

C Size in bytes of the stack frame reserved on entry and released on return.
C Used inside it: [%sp+104] and [%sp+112] hold the two halves of v, and a
C 32-byte aligned block at or just below %sp+160 holds the product scratch.
define(`FSIZE',224)

ASM_START()

C mpn_submul_1 (rp, up, n, v)
C
C For every 32-bit limb, in increasing address order:
C	p     = up[i] * v + cy
C	d     = rp[i] - p			(64-bit arithmetic)
C	rp[i] = low 32 bits of d
C	cy    = low 32 bits of -(d >> 32)	(the borrow into the next limb)
C and the final borrow is returned in %o0.
C
C n must be at least 1: the count is decremented before it is first tested,
C so a zero count would not terminate the dispatch below.
C
C Register roles:
C	%o0  rp (pre-biased on the short paths, see below)
C	%o1  up
C	%o2  remaining count
C	%o5  base of the current 16-byte product scratch slot
C	%g1  p0  = up[i] * (v & 0xffff)
C	%g2  p16 = up[i] * (v >> 16)
C	%g3  cy
C	%g4  p, then the difference rp[i] - p
C	%g5  rp[i]
C	%f6  (double) low 16 bits of v
C	%f8  (double) high 16 bits of v
C	%f10/%f11  64-bit integer image of up[i]: %f10 = 0, %f11 = up[i]
C	%f2  (double) up[i]
C	%f16, %f4   products as doubles (high half, low half)
C	%f14, %f12  the same products converted to 64-bit integers
C
C Each product is below 2^48 (a 32-bit limb times a 16-bit half), so it fits
C the mantissa of a double and the floating-point multiply is exact.
C
C The main loop is software pipelined: one iteration overlaps the load and
C conversion of a new up limb, the multiplies for an earlier limb, the
C conversion and store of still earlier products, and the integer subtract of
C an even earlier limb.  Counts 1 to 4 therefore get their own straight-line
C paths, each of which branches into the drain code at .L1 to .L4.  Those
C paths bias rp by -16, -12, -8 or -4 so the drain code can use fixed offsets.
PROLOGUE(mpn_submul_1)
	add	%sp, -FSIZE, %sp	C reserve the frame (no register window is allocated)
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v >> 16
	or	%g1, %lo(0xffff), %g1	C g1 = 0xffff
	and	%o3, %g1, %g1		C g1 = v & 0xffff
	stx	%g1, [%sp+104]		C move both halves to the FPU through memory
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) low half of v
	fxtod	%f8, %f8		C f8 = (double) high half of v
	ld	[%sp+104], %f10		C zero f10 (upper word of the value stored above)

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

C Dispatch on n.  The instruction following each branch sits in its delay
C slot and is executed whether or not the branch is taken.
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[0]

C n == 1
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0		C (delay slot) bias rp: .L1 stores at [%o0+16]

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2		C (delay slot)

C n == 2
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0		C (delay slot) bias rp: .L2 stores at [%o0+12]

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2		C (delay slot)

C n == 3
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0		C (delay slot) bias rp: .L3 stores at [%o0+8]

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2		C (delay slot)

C n == 4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++ (up is not read again on this path)
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0		C (delay slot) bias rp: .L4 stores at [%o0+4]

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2		C (delay slot)
	b,a	.L5			C n == 5: annulled branch, no delay slot instruction runs

C BEGIN MAIN LOOP
C The "C -- k" markers below split one iteration into seven groups of four
C instructions; presumably these are the intended issue groups (TODO confirm
C against the target pipeline).  The nop and fanop entries are fillers.
	.align 16
C -- 0
.Loop:	sub	%g0, %g3, %g3		C negate the high word left by the previous subtract
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	nop
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]		C store the low word; rp was already advanced
	fanop
C -- 6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2		C (delay slot)
C END MAIN LOOP

C Pipeline drain.  Execution falls through .L5, .L4, .L3, .L2 and .L1; each
C stage stores one more limb and does a little less floating-point work.
C The short-count paths branch directly to .L1 - .L4 and so skip the
C "sub %g0, %g3, %g3" placed just before those labels.  That is harmless
C because %g3 still holds the initial 0 on those paths.
.L5:	sub	%g0, %g3, %g3
	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5		C switch to the other scratch slot
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

	sub	%g0, %g3, %g3
.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5		C switch to the other scratch slot
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

	sub	%g0, %g3, %g3
.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	xor	%o5, 16, %o5		C switch to the other scratch slot
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

	sub	%g0, %g3, %g3
.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

	sub	%g0, %g3, %g3
.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	stw	%g4, [%o0+16]		C most significant result limb
	srlx	%g4, 32, %g3		C new cy

	sub	%g0, %g3, %o0		C return value: negated high word, i.e. the final borrow
	retl
	sub	%sp, -FSIZE, %sp	C (delay slot) release the frame
EPILOGUE(mpn_submul_1)