dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
dnl  unnormalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C Itanium:    40-42
C Itanium 2:  29-30

C This was generated by gcc, then the loops were optimized.  The preinv entry
C point was shoehorned into the file.  Lots of things outside the loops could
C be streamlined.  It would probably be a good idea to merge the loops for
C normalized and unnormalized divisor, since the shifting stuff is done for
C free in parallel with other operations.  It would even be possible to merge
C all loops, if the ld8 were made conditional.

C TODO
C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
C    computing leading limb.
C  * Inline and interleave limb inversion code with loop setup code.
ASM_START()

C HP's assembler requires these declarations for importing mpn_invert_limb
	.global	mpn_invert_limb
	.type	mpn_invert_limb,@function

C INPUT PARAMETERS
C rp    = r32
C qxn   = r33
C up    = r34
C n     = r35
C vl    = r36
C vlinv = r37  (preinv only)
C cnt   = r38  (preinv only)

C REGISTER USE (as established by the code below)
C r8      inverse handed to the loops through f6: vlinv on the preinv entry,
C         otherwise r8 as it stands after the call to mpn_invert_limb
C         (presumably its return value).  Also holds the value returned.
C r32     quotient store pointer; starts at rp + n + qxn - 1 and is
C         post-decremented by 8 on every st8
C r34     source pointer; walks downwards from the most significant limb
C r35     remaining limb count, then the ar.lc loop count
C r36     divisor; shifted left by r40 on the unnormalized paths
C r38     running remainder (kept in f7 as well inside the loops)
C r39     previously loaded source limb on the unnormalized paths
C r40     shift count; 0 on the normalized path.  The remainder is shifted
C         right by r40 at .Lend4 before it is returned.
C r41, r42, r44   saved b0, ar.pfs and ar.lc, restored at .Lret
C r45     argument register for the call to mpn_invert_limb
C f6 = inverse, f7 = remainder, f10 = divisor, f12 = 1
C
C Both entry points save b0, ar.pfs and ar.lc in the same registers, which is
C what lets mpn_preinv_divrem_1 branch into the body of mpn_divrem_1 and
C return through the shared .Lret sequence.

C mpn_preinv_divrem_1 -- entry point taking a precomputed inverse (r37) and
C shift count (r38).  It handles only the most significant limb itself and
C then branches to .Lpn, .Lpu or .L435 inside mpn_divrem_1.
C NOTE(review): the high limb up[n-1] is loaded unconditionally here, so this
C entry presumably requires n >= 1 -- confirm against callers.
PROLOGUE(mpn_preinv_divrem_1)
	.prologue
	.save	ar.pfs, r42
	alloc		r42 = ar.pfs, 7, 8, 1, 0
	.save	ar.lc, r44
	mov		r44 = ar.lc
	.save	rp, r41
	mov		r41 = b0
	.body
ifdef(`HAVE_ABI_32',
`	addp4		r32 = 0, r32
	sxt4		r33 = r33
	addp4		r34 = 0, r34
	sxt4		r35 = r35
	;;
')
	mov		r40 = r38		C r40 = cnt
	shladd		r34 = r35, 3, r34	C r34 = up + n
	;;
	adds		r34 = -8, r34		C r34 = up + n - 1
	;;
	ld8		r39 = [r34], -8		C r39 = high limb, r34 steps down
	;;

	add		r15 = r35, r33		C r15 = n + qxn
	;;
	mov		r8 = r37		C r8 = vlinv
	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
	cmp.le		p8, p0 = 0, r36		C signed: top bit of vl clear?
	;;
	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
	cmp.leu		p6, p7 = r36, r39	C p6: vl <= high limb (unsigned)
   (p8)	br.cond.dpnt	.Lpunnorm		C top bit clear: unnormalized path
	;;

C Normalized divisor: the top quotient limb is 1 when vl <= high limb, else 0,
C and the remainder is the high limb reduced accordingly.
   (p6)	addl		r15 = 1, r0
   (p7)	mov		r15 = r0
	;;
   (p6)	sub		r38 = r39, r36
   (p7)	mov		r38 = r39
	st8		[r32] = r15, -8
	adds		r35 = -2, r35		C un -= 2
	br	.Lpn

.Lpunnorm:
C Unnormalized divisor.  When vl <= high limb (p6) the post-decrement of r34
C is undone so that .Lpu loads the high limb again, with r = 0.
   (p6)	add		r34 = 8, r34
	mov		r38 = 0			C r = 0
	shl		r36 = r36, r40		C vl <<= cnt
   (p6)	br.cond.dptk	.Lpu
	;;
C Here high limb < vl: the top quotient limb is zero and the high limb becomes
C the initial remainder.  The compare against 1 sits in the same instruction
C group as the decrement of r35, so it tests n as it was before un--.
	shl		r38 = r39, r40		C r = ahigh << cnt
	cmp.ne		p8, p0 = 1, r35
	st8		[r32] = r0, -8
	adds		r35 = -1, r35		C un--
   (p8)	br.cond.dpnt	.Lpu

C n was 1: no integer limbs remain.  Set up f6 = inverse and f12 = 1 the same
C way .Lpn and .Lpu do, then go straight to the fraction limbs.
	mov		r23 = 1
	;;
	setf.sig	f6 = r8
	setf.sig	f12 = r23
	br		.L435
EPILOGUE()


C mpn_divrem_1 -- entry point without a precomputed inverse.  Returns 0
C immediately when n + qxn is zero.  Otherwise it calls mpn_invert_limb with
C the (shifted, if needed) divisor in r45 and then develops n integer quotient
C limbs followed by qxn fraction limbs, storing them downwards from
C rp[n + qxn - 1].  The value returned in r8 is the final remainder shifted
C right by r40.
PROLOGUE(mpn_divrem_1)
	.prologue
	.save	ar.pfs, r42
	alloc		r42 = ar.pfs, 5, 8, 1, 0
	.save	ar.lc, r44
	mov		r44 = ar.lc
	.save	rp, r41
	mov		r41 = b0
	.body
ifdef(`HAVE_ABI_32',
`	addp4		r32 = 0, r32
	sxt4		r33 = r33
	addp4		r34 = 0, r34
	sxt4		r35 = r35
	;;
')
	mov		r38 = r0		C r = 0
	add		r15 = r35, r33		C r15 = n + qxn
	;;
	cmp.ne		p6, p7 = 0, r15
	;;
   (p7)	mov		r8 = r0			C nothing to do: return 0
   (p7)	br.cond.dpnt	.Lret
	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
	cmp.le		p6, p7 = 0, r36		C signed: top bit of vl clear?
	;;
	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
   (p6)	br.cond.dpnt	.Lunnorm
	cmp.eq		p6, p7 = 0, r35		C n == 0: only fraction limbs
   (p6)	br.cond.dpnt	.L179
	shladd		r14 = r35, 3, r34	C r14 = up + n
	;;
	adds		r14 = -8, r14		C r14 = up + n - 1
	adds		r35 = -1, r35
	;;
	ld8		r38 = [r14]		C r38 = high limb
	;;
	cmp.leu		p6, p7 = r36, r38	C p6: vl <= high limb (unsigned)
	;;
   (p6)	addl		r15 = 1, r0		C top quotient limb is 1 or 0
   (p7)	mov		r15 = r0
	;;
	st8		[r32] = r15, -8
  (p6)	sub		r38 = r38, r36		C r = high limb - vl

.L179:
C Reached with r38 = 0 when n == 0, otherwise with the reduced high limb.
	mov		r45 = r36		C argument for mpn_invert_limb
	adds		r35 = -1, r35
	br.call.sptk.many b0 = mpn_invert_limb
	;;
	shladd		r34 = r35, 3, r34	C r34 = up + r35, next limb to load
.Lpn:
C Common setup for the normalized loop; mpn_preinv_divrem_1 joins here.
	mov		r23 = 1
	;;
	setf.sig	f6 = r8			C f6 = inverse
	setf.sig	f12 = r23		C f12 = 1
	cmp.le		p6, p7 = 0, r35
	mov		r40 = 0			C no shift on the normalized path
   (p7)	br.cond.dpnt	.L435			C r35 < 0: no integer limbs left
	setf.sig	f10 = r36		C f10 = divisor
	mov		ar.lc = r35
	setf.sig	f7 = r38		C f7 = remainder
	;;
C NOTE(review): r28 is written again inside the loop before it is ever read,
C so this value appears to be unused.
	sub		r28 = -1, r36
C Develop quotient limbs for normalized divisor
C Each iteration loads the next lower source limb, forms a quotient estimate
C q = high(nh * inverse) + nh, subtracts q * divisor from the two-limb value
C nh:limb, and then applies up to three conditional corrections, each of which
C adds 1 to q and subtracts the divisor from the remainder.
.Loop1:		C 00				C q=r18 nh=r38/f7
	ld8		r20 = [r34], -8
	xma.hu		f11 = f7, f6, f0
	;;	C 04
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;	C 08
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	xma.l		f8 = f8, f10, f0
	;;	C 12
	getf.sig	r16 = f9
		C 13
	getf.sig	r15 = f8
	;;	C 18
	cmp.ltu		p6, p7 = r20, r15
	sub		r15 = r20, r15
	sub		r16 = r38, r16
	;;	C 19
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;	C 20
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;	C 21
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	cmp.ltu		p6, p7 = r15, r36	C speculative
	sub		r28 = r15, r36		C speculative, just for cmp
	;;	C 22
   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   (p8)	mov		r15 = r28
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;	C 23
   (p6)	setf.sig	f7 = r15
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;	C 24
   (p7)	setf.sig	f7 = r15
	st8		[r32] = r18, -8
	mov		r38 = r15
	br.cloop.dptk	.Loop1
		C 29/30
	br.sptk		.L435
	;;
.Lunnorm:
C Unnormalized divisor (top bit of vl clear).  The byte-reversed copy of vl in
C r16 is consumed at .L322 to find the shift count.
	mux1		r16 = r36, @rev
	cmp.eq		p6, p7 = 0, r35		C n == 0: only fraction limbs
   (p6)	br.cond.dpnt	.L322
	shladd		r34 = r35, 3, r34	C r34 = up + n
	;;
	adds		r34 = -8, r34		C r34 = up + n - 1
	;;
	ld8		r39 = [r34]		C r39 = high limb
	;;
	cmp.leu		p6, p7 = r36, r39
C vl <= high limb: keep r = 0 and leave r34 on the high limb so that .Lpu
C loads it as the first limb to divide.
   (p6)	br.cond.dptk	.L322
C high limb < vl: the top quotient limb is zero and the high limb becomes the
C initial remainder.
	adds		r34 = -8, r34
	;;
	mov		r38 = r39
	;;
	cmp.ne		p6, p7 = 1, r15		C r15 = n + qxn
	st8		[r32] = r0, -8
	;;
   (p7)	mov		r8 = r38		C single limb: remainder is up[0]
   (p7)	br.cond.dpnt	.Lret
	adds		r35 = -1, r35
.L322:
C Compute the normalization shift count into r40.  The byte-level step below
C locates the most significant nonzero byte of vl; the following compares
C narrow that down to the top set bit inside that byte.  The net result is
C r40 = 63 - (bit index of the highest set bit of vl).
	sub		r14 = r0, r16
	;;
	or		r14 = r16, r14
	;;
	mov		r16 = -8
	czx1.l		r14 = r14
	;;
	shladd		r16 = r14, 3, r16	C r16 = 8 * r14 - 8
	;;
	shr.u		r14 = r36, r16		C r14 = top nonzero byte of vl
	;;
	cmp.geu		p6, p7 = 15, r14
	;;
   (p7)	shr.u		r14 = r14, 4
   (p7)	adds		r16 = 4, r16
	;;
	cmp.geu		p6, p7 = 3, r14
	;;
   (p7)	shr.u		r14 = r14, 2
   (p7)	adds		r16 = 2, r16
	;;
	tbit.nz		p6, p7 = r14, 1
	;;
	.pred.rel "mutex",p6,p7
  (p6)	sub		r40 = 62, r16
  (p7)	sub		r40 = 63, r16
	;;
	shl		r45 = r36, r40		C argument: normalized divisor
	shl		r36 = r36, r40		C vl <<= cnt
	shl		r38 = r38, r40		C r <<= cnt
	br.call.sptk.many b0 = mpn_invert_limb
	;;
.Lpu:
C Common setup for the unnormalized loop; mpn_preinv_divrem_1 joins here with
C r40 = cnt, r36 already shifted and r8 = vlinv.
	mov		r23 = 1
	;;
	setf.sig	f6 = r8			C f6 = inverse
	setf.sig	f12 = r23		C f12 = 1
	cmp.eq		p6, p7 = 0, r35
   (p6)	br.cond.dpnt	.L435			C no integer limbs left
	sub		r16 = 64, r40		C r16 = 64 - cnt
	adds		r35 = -2, r35
	;;
	ld8		r39 = [r34], -8
	cmp.le		p6, p7 = 0, r35
	;;
	shr.u		r14 = r39, r16		C bits of this limb that spill
	;;					C into the remainder
	or		r38 = r14, r38
   (p7)	br.cond.dpnt	.Lend3			C only one limb left: tail code
	;;
	mov		r22 = r16		C r22 = 64 - cnt for the loop
	setf.sig	f10 = r36
	setf.sig	f7 = r38
	mov		ar.lc = r35
	;;
C Develop quotient limbs for unnormalized divisor
C Same scheme as .Loop1, except that the low limb fed to the division step is
C assembled on the fly as (r39 << cnt) | (next limb >> (64 - cnt)).
.Loop3:
	ld8		r14 = [r34], -8
	xma.hu		f11 = f7, f6, f0
	;;
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	shl		r20 = r39, r40
	xma.l		f8 = f8, f10, f0
	shr.u		r24 = r14, r22
	;;
	getf.sig	r16 = f9
	getf.sig	r15 = f8
	or		r20 = r24, r20
	;;
	cmp.ltu		p6, p7 = r20, r15
	sub		r15 = r20, r15
	sub		r16 = r38, r16
	;;
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	cmp.ltu		p6, p7 = r15, r36	C speculative
	sub		r28 = r15, r36		C speculative, just for cmp
	;;
   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   (p8)	mov		r15 = r28
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p6)	setf.sig	f7 = r15
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p7)	setf.sig	f7 = r15
	st8		[r32] = r18, -8
	mov		r39 = r14		C carry the limb to next round
	mov		r38 = r15
	br.cloop.dptk	.Loop3
	;;
.Lend3:
C Last integer limb of the unnormalized case: there is no lower source limb
C left, so the low limb is just r39 << cnt.  The correction steps are the same
C as in the loop, written without the speculative compare.
	setf.sig	f10 = r36
	setf.sig	f7 = r38
	;;
	xma.hu		f11 = f7, f6, f0
	;;
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	shl		r20 = r39, r40
	xma.l		f8 = f8, f10, f0
	;;
	getf.sig	r16 = f9
	getf.sig	r15 = f8
	;;
	cmp.ltu		p6, p7 = r20, r15
	sub		r15 = r20, r15
	sub		r16 = r38, r16
	;;
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	;;
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	cmp.ltu		p6, p7 = r15, r36
	;;
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	st8		[r32] = r18, -8
	mov		r38 = r15
.L435:
C Fraction limbs: run qxn more division steps with a zero low limb.  Every
C path arrives here with f6 = inverse, f12 = 1, r36 = divisor, r38 = remainder.
	adds		r35 = -1, r33		C r35 = qxn - 1
	cmp.le		p6, p7 = 1, r33
   (p7)	br.cond.dpnt	.Lend4			C qxn < 1: no fraction limbs
	;;
	setf.sig	f7 = r38
	setf.sig	f10 = r36
	mov		ar.lc = r35
	;;
.Loop4:
	xma.hu		f11 = f7, f6, f0
	;;
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	xma.l		f8 = f8, f10, f0
	;;
	getf.sig	r16 = f9
	getf.sig	r15 = f8
	;;
	cmp.ltu		p6, p7 = 0, r15		C low limb is 0 here
	sub		r15 = 0, r15
	sub		r16 = r38, r16
	;;
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	cmp.ltu		p6, p7 = r15, r36	C speculative
	sub		r28 = r15, r36		C speculative, just for cmp
	;;
   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   (p8)	mov		r15 = r28
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p6)	setf.sig	f7 = r15
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p7)	setf.sig	f7 = r15
	st8		[r32] = r18, -8
	mov		r38 = r15
	br.cloop.dptk	.Loop4
	;;
.Lend4:
	shr.u		r8 = r38, r40		C return r >> cnt
.Lret:
C Shared exit for both entry points: restore the saved state and return.
	mov		ar.pfs = r42
	mov		ar.lc = r44
	mov		b0 = r41
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()