dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.

dnl  Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C Itanium:      16
C Itanium 2:     8

C INPUT PARAMETERS
C   rp       pointer the quotient limbs are stored through
C   up       pointer the dividend limbs are loaded through
C   n        limb count
C   divisor  the single limb divisor
define(`rp', `r32')
define(`up', `r33')
define(`n',  `r34')
define(`divisor', `r35')

C Shift counts used when the divisor is even: rshift is the number of low
C zero bits of the divisor (0 when the divisor is odd), lshift is 64 - rshift.
define(`lshift', `r24')
define(`rshift', `r25')

C This code is a bit messy, and not as similar to mode1o.asm as desired.

C The critical path during initialization is for computing the inverse of the
C divisor.  Since odd divisors are probably common, we conditionally execute
C the initial count_trailing_zeros code and the downshift.

C Possible improvement: Merge more of the feed-in code into the inverse
C computation.

ASM_START()
	.text
	.align	32

C Table of 8-bit inverses.  The byte at odd offset d is the inverse of d
C modulo 2^8 (for example 3 * 0xAB = 0x201).  The zero bytes at the even
C offsets are padding, so the low byte of the odd divisor can be added to the
C table address and used as the index without any shifting.
.Ltab:
data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF


C mpn_divexact_1 (rp, up, n, divisor)
C
C Divide the n limbs read through up by the single limb divisor and store n
C quotient limbs through rp.  No remainder is computed.
C NOTE(review): presumably the caller guarantees that the division is exact
C and that n >= 1 and divisor != 0; none of this is checked here -- confirm.
C
C Outline:
C   1. If the divisor is even, count its low zero bits into rshift and shift
C      them out, so the divisor used from then on is odd.  lshift = 64-rshift.
C   2. Fetch an 8-bit inverse of the odd divisor from .Ltab and widen it to
C      64 bits with three Newton steps  i = 2*i + i*i*(-d).
C   3. Each source limb is (up[k] >> rshift) | (up[k+1] << lshift), the last
C      one just up[n-1] >> rshift.  With si = ulimb * inverse the quotient
C      limbs are
C          q = c * -inverse + si,    c = high (q * divisor + c)
C      starting from c = 0.
C
C Dispatch on the limb count (n is biased by -4 early on, and that biased
C value is also what goes into ar.lc):
C   n = 1    fall through to .Ln1, one store at .Lx1
C   n = 2    p7 branches to .Ln2, joins the wind-down at .Lx2
C   n = 3    p10 branches to .grt3, then p8 branches to .Lx3
C   n >= 4   .grt3 enters the loop at .Lent, ar.lc = n-4
C
C ar.lc is saved in r2 on entry and restored just before returning.
PROLOGUE(mpn_divexact_1)
	.prologue
	.save		ar.lc, r2
	.body

 {.mmi;	add		r8 = -1, divisor	C M0
	nop		0			C M1
	tbit.z		p8, p9 = divisor, 0	C I0  p8: even divisor, p9: odd
}
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M2  rp extend
	addp4		up = 0, up		C M3  up extend
	sxt4		n = n')			C I1  size extend
	;;
.Lhere:
 {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
  (p8)	andcm		r8 = r8, divisor	C M1  (divisor-1) & ~divisor
	mov		r15 = ip		C I0  .Lhere
	;;
}{.mii
	.pred.rel "mutex", p8, p9
  (p9)	mov		rshift = 0		C M0
  (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
	cmp.eq		p6, p10 = 1, n		C I1  p6: n = 1, p10: n != 1
	;;
}{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0  address of .Ltab
  (p8)	shr.u		divisor = divisor, rshift C I0  make divisor odd
	nop		0			C I1
	;;
}{.mmi;	add		n = -4, n		C M0  size-4
  (p10)	ld8		r21 = [up], 8		C M1  up[1]
	mov		r14 = 2			C I0  2
}{.mfi;	setf.sig	f6 = divisor		C M2  divisor
	mov		f9 = f0			C F0  carry		FIXME
	zxt1		r3 = divisor		C I1  divisor low byte
	;;
}{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
	sub		r16 = 0, divisor	C M1  -divisor
	mov		r2 = ar.lc		C I0  save caller ar.lc
}{.mmi;	sub		lshift = 64, rshift	C M2
	setf.sig	f13 = r14		C M3  2 in significand
	mov		r17 = -1		C I1  -1
	;;
}{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
	nop		0			C M1
	mov		ar.lc = n		C I0  size-4 loop count
}{.mmi;	setf.sig	f12 = r16		C M2  -divisor
	setf.sig	f8 = r17		C M3  -1
	cmp.eq		p7, p0 = -2, n		C I1  p7: original n = 2
	;;
}{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
	cmp.eq		p8, p0 = -1, n		C M0  p8 reused: original n = 3
	shr.u		r23 = r20, rshift	C I0  up[0] >> rshift
	;;
}

	C f6	divisor
	C f7	inverse, being calculated
	C f8	-1, will be -inverse
	C f9	carry
	C f12	-divisor
	C f13	2
	C f14	scratch

	C Three Newton steps widen the inverse 8 -> 16 -> 32 -> 64 bits.  The
	C integer instructions interleaved below prepare the first one or two
	C source limbs before it is known which of them will be needed.
	xmpy.l		f14 = f13, f7		C Newton 2*i
	xmpy.l		f7 = f7, f7		C Newton i*i
	;;
	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
	;;
	setf.sig	f10 = r23		C speculative, used iff n = 1
	xmpy.l		f14 = f13, f7		C Newton 2*i
	shl		r22 = r21, lshift	C speculative, used iff n > 1
	xmpy.l		f7 = f7, f7		C Newton i*i
	;;
	or		r31 = r22, r23		C speculative, used iff n > 1
	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
	;;
	setf.sig	f11 = r31		C speculative, used iff n > 1
	xmpy.l		f14 = f13, f7		C Newton 2*i
	xmpy.l		f7 = f7, f7		C Newton i*i
	;;
	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits

  (p7)	br.cond.dptk	.Ln2			C n = 2
  (p10)	br.cond.dptk	.grt3			C n >= 3 (n = 2 already taken)
	;;

	C n = 1: the only quotient limb is (up[0] >> rshift) * inverse.
.Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
	br		.Lx1

	C n = 2: first quotient limb here, second one in the tail at .Lx2.
.Ln2:
	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
	setf.sig	f11 = r23		C last limb, up[1] >> rshift
	br		.Lx2

	C n >= 3: compute the first quotient limb and form the second source
	C limb, then either drop into the tail (n = 3) or enter the loop.
.grt3:
	ld8		r21 = [up], 8		C up[2]
	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
	;;
	shl		r22 = r21, lshift
	;;
	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
	;;
	or		r31 = r22, r23
	shr.u		r23 = r21, rshift
	;;
	setf.sig	f11 = r31
  (p8)	br.cond.dptk	.Lx3			C branch for n = 3
	;;
	ld8		r21 = [up], 8
	br		.Lent

	C Main loop.  Each pass stores one quotient limb, updates the carry and
	C forms the next shifted source limb.  r16 is the address 160 bytes past
	C the current up pointer and is only used for the lfetch prefetch.
.Ltop:	ld8		r21 = [up], 8
	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
	nop.b		0
	;;
.Lent:	add		r16 = 160, up
	shl		r22 = r21, lshift
	nop.b		0
	;;
	stf8		[rp] = f12, 8
	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
	nop.b		0
	nop.m		0
	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
	nop.b		0
	;;
	or		r31 = r22, r23
	shr.u		r23 = r21, rshift
	nop.b		0
	;;
	lfetch		[r16]
	setf.sig	f11 = r31
	br.cloop.sptk.few.clr .Ltop


	C Wind-down: the last three quotient limbs.  The short cases join at
	C .Lx3, .Lx2 and .Lx1 with q already in f12.
	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
	;;
.Lx3:	stf8		[rp] = f12, 8
	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
	;;
	setf.sig	f11 = r23		C last limb, no bits from above
	;;
	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
	;;
.Lx2:	stf8		[rp] = f12, 8
	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
	;;
	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
	;;
.Lx1:	stf8		[rp] = f12, 8
	mov		ar.lc = r2		C I0  restore caller ar.lc
	br.ret.sptk.many b0
EPILOGUE()