1dnl IA-64 mpn_divrem_2 -- Divide an n-limb number by a 2-limb number. 2 3dnl Copyright 2004, 2005 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of the GNU Lesser General Public License as published 9dnl by the Free Software Foundation; either version 3 of the License, or (at 10dnl your option) any later version. 11 12dnl The GNU MP Library is distributed in the hope that it will be useful, but 13dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 15dnl License for more details. 16 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22C cycles/limb 23C Itanium: 63 24C Itanium 2: 46 25 26 27C TODO 28C * Further optimize the loop. We could probably do some more trickery with 29C arithmetic in the FPU, or perhaps use a non-zero addend of xma in more 30C places. 31C * Software pipeline for perhaps 5 saved cycles, around the end and start of 32C the loop. 33C * Schedule code outside of loop better. 34C * Update the comments. They are now using the same name for the same 35C logical quantity. 36C * Handle conditional zeroing of r31 in loop more cleanly. 37C * Inline mpn_invert_limb and schedule its insns across the entire init code. 38C * Ultimately, use 2-limb, or perhaps 3-limb or 4-limb inverse. 39 40define(`qp',`r32') 41define(`qxn',`r33') 42define(`np',`r34') 43define(`nn',`r35') 44define(`dp',`r36') 45 46define(`fnh',`f11') 47define(`fminus1',`f10') 48define(`fd0',`f13') 49define(`fd1',`f14') 50define(`d0',`r39') 51define(`d1',`r36') 52define(`fnl',`f32') 53define(`fdinv',`f12') 54 55define(`R1',`r38') define(`R0',`r37') 56define(`P1',`r28') define(`P0',`r27') 57 58ASM_START() 59 60C HP's assembler requires these declarations for importing mpn_invert_limb 61 .global mpn_invert_limb 62 .type mpn_invert_limb,@function 63 64PROLOGUE(mpn_divrem_2) 65 .prologue 66 .save ar.pfs, r42 67 .save ar.lc, r44 68 .save rp, r41 69ifdef(`HAVE_ABI_32', 70` addp4 qp = 0, qp C M I 71 addp4 np = 0, np C M I 72 addp4 dp = 0, dp C M I 73 zxt4 nn = nn C I 74 zxt4 qxn = qxn C I 75 ;; 76') 77 78 alloc r42 = ar.pfs, 5,8,1,0 C M2 79 ld8 d0 = [dp], 8 C M0M1 d0 80 mov r44 = ar.lc C I0 81 shladd np = nn, 3, np C M I 82 ;; 83 ld8 d1 = [dp] C M0M1 d1 84 mov r41 = b0 C I0 85 add r15 = -8, np C M I 86 add np = -16, np C M I 87 mov r40 = r0 C M I 88 ;; 89 ld8 R1 = [r15] C M0M1 n1 90 ld8 R0 = [r34], -8 C M0M1 n0 91 ;; 92 cmp.ltu p6, p0 = d1, R1 C M I 93 cmp.eq p8, p0 = d1, R1 C M I 94 ;; 95 (p8) cmp.leu p6, p0 = d0, R0 96 cmp.ltu p8, p9 = R0, d0 97 (p6) br.cond.dpnt .L_high_limb_1 C FIXME: inline! 98.L8: 99 100 mov r45 = d1 101 br.call.sptk.many b0 = mpn_invert_limb C FIXME: inline+schedule 102 ;; 103 setf.sig fd1 = d1 C d1 104 setf.sig fd0 = d0 C d0 105 add r14 = r33, r35 C nn + qxn 106 ;; 107 setf.sig fdinv = r8 C dinv 108 mov r9 = -1 109 add r35 = -3, r14 110 ;; 111 setf.sig fminus1 = r9 112 cmp.gt p6, p0 = r0, r35 113 shladd qp = r35, 3, qp 114 mov ar.lc = r35 115 mov r31 = 0 C n0 116 (p6) br.cond.dpnt .Ldone 117 ;; 118 ALIGN(16) 119C *** MAIN LOOP START *** 120.Loop: C 00 121 mov r15 = R0 C nadj = n10 122 cmp.le p14, p15 = 0, R0 C check high bit of R0 123 cmp.le p8, p0 = r33, r35 C dividend limbs remaining? 124 ;; C 01 125 .pred.rel "mutex", p14, p15 126 (p8) ld8 r31 = [r34], -8 C n0 127 (p15) add r15 = d1, R0 C nadj = n10 + d1 128 (p15) add r14 = 1, R1 C nh + (nl:63) 129 (p14) mov r14 = R1 C nh 130 cmp.eq p6, p0 = d1, R1 C nh == d1 131 (p6) br.cond.spnt .L_R1_eq_d1 132 ;; C 02 133 setf.sig f8 = r14 C n2 + (nl:63) 134 setf.sig f15 = r15 C nadj 135 sub r23 = -1, R1 C r23 = ~nh 136 ;; C 03 137 setf.sig fnh = r23 138 setf.sig fnl = R0 139 ;; C 08 140 xma.hu f7 = fdinv, f8, f15 C xh = HI(dinv*(nh-nmask)+nadj) 141 ;; C 12 142 xma.l f7 = f7, fminus1, fnh C nh + xh 143 ;; C 16 144 getf.sig r14 = f7 145 xma.hu f9 = f7, fd1, fnl C xh = HI(q1*d1+nl) 146 xma.l f33 = f7, fd1, fnl C xh = LO(q1*d1+nl) 147 ;; C 20 148 getf.sig r16 = f9 149 sub r24 = d1, R1 150 C 21 151 getf.sig r17 = f33 152 ;; C 25 153 cmp.eq p6, p7 = r16, r24 154 ;; C 26 155 .pred.rel "mutex", p6, p7 156 (p6) xma.l f8 = f7, fminus1, f0 C f8 = -f7 157 (p7) xma.l f8 = f7,fminus1,fminus1 C f8 = -f7-1 158 ;; C 27 159 .pred.rel "mutex", p6, p7 160 (p6) sub r18 = 0, r14 C q = -q1 161 (p7) sub r18 = -1, r14 C q = -q1-1 162 (p6) add r14 = 0, r17 C n1 = xl 163 (p7) add r14 = d1, r17 C n1 = xl + d1 164 ;; C 30 165 xma.hu f9 = fd0, f8, f0 C d0*(-f7-1) = -d0*f7-d0 166 xma.l f35 = fd0, f8, f0 167 ;; C 34 168 getf.sig P1 = f9 C P1 169 C 35 170 getf.sig P0 = f35 C P0 171 ;; 172.L_adj: C 40 173 cmp.ltu p8, p0 = r31, P0 C p8 = cy from low limb 174 cmp.ltu p6, p0 = r14, P1 C p6 = prel cy from high limb 175 sub R0 = r31, P0 176 sub R1 = r14, P1 177 ;; C 41 178 (p8) cmp.eq.or p6, p0 = 0, R1 C p6 = final cy from high limb 179 (p8) add R1 = -1, R1 180 cmp.ne p10, p0 = r0, r0 C clear p10 FIXME: use unc below! 181 cmp.ne p13, p0 = r0, r0 C clear p13 FIXME: use unc below! 182 ;; C 42 183 (p6) add R0 = R0, d0 184 (p6) add R1 = R1, d1 185 (p6) add r18 = -1, r18 C q-- 186 ;; C 43 187 (p6) cmp.ltu p10, p0 = R0, d0 188 (p6) cmp.ltu p0, p13 = R1, d1 189 ;; C 44 190 (p10) cmp.ne.and p0, p13 = -1, R1 C p13 = !cy 191 (p10) add R1 = 1, R1 192 (p13) br.cond.spnt .L_two_too_big C jump if not cy 193 ;; C 45 194 st8 [qp] = r18, -8 195 add r35 = -1, r35 196 mov r31 = 0 C n0, next iteration 197 br.cloop.sptk .Loop 198C *** MAIN LOOP END *** 199 ;; 200.Ldone: 201 mov r8 = r40 202 mov b0 = r41 203 add r21 = 8, r34 204 add r22 = 16, r34 205 ;; 206 st8 [r21] = R0 207 st8 [r22] = R1 208 mov ar.pfs = r42 209 mov ar.lc = r44 210 br.ret.sptk.many b0 211 212.L_high_limb_1: 213 .pred.rel "mutex", p8, p9 214 sub R0 = R0, d0 215 (p8) sub R1 = R1, d1, 1 216 (p9) sub R1 = R1, d1 217 mov r40 = 1 218 br.sptk .L8 219 ;; 220 221.L_two_too_big: 222 add R0 = R0, d0 223 add R1 = R1, d1 224 ;; 225 add r18 = -1, r18 C q-- 226 cmp.ltu p10, p0 = R0, d0 227 ;; 228 (p10) add R1 = 1, R1 229 st8 [qp] = r18, -8 230 add r35 = -1, r35 231 mov r31 = 0 C n0, next iteration 232 br.cloop.sptk .Loop 233 br.sptk .Ldone 234 235.L_R1_eq_d1: 236 add r14 = R0, d1 C r = R0 + d1 237 mov r18 = -1 C q = -1 238 ;; 239 cmp.leu p6, p0 = R0, r14 240 (p6) br.cond.spnt .L20 C jump unless cy 241 ;; 242 sub P1 = r14, d0 243 add R0 = r31, d0 244 ;; 245 cmp.ltu p8, p9 = R0, r31 246 ;; 247 .pred.rel "mutex", p8, p9 248 st8 [qp] = r18, -8 249 (p8) add R1 = r0, P1, 1 C R1 = n1 - P1 - cy 250 (p9) add R1 = r0, P1 C R1 = n1 - P1 251 add r35 = -1, r35 252 mov r31 = 0 C n0, next iteration 253 br.cloop.sptk .Loop 254 br.sptk .Ldone 255 ;; 256.L20: cmp.ne p6, p7 = 0, d0 257 ;; 258 .pred.rel "mutex", p6, p7 259 (p6) add P1 = -1, d0 260 (p7) mov P1 = d0 261 sub P0 = r0, d0 262 br.sptk .L_adj 263EPILOGUE() 264ASM_END() 265