1dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. 2 3dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P4: 19.0 cycles/limb 24 25 26C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 27C mp_limb_t divisor); 28C 29C Pairs of movd's are used to avoid unaligned loads. Despite the loads not 30C being on the dependent chain and there being plenty of cycles available, 31C using an unaligned movq on every second iteration measured about 23 c/l. 32C 33C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul 34C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of 35C that might be hidden by out-of-order execution, whereas divl is around 60. 36C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul 37C faster. 38 39defframe(PARAM_DIVISOR,16) 40defframe(PARAM_SIZE, 12) 41defframe(PARAM_SRC, 8) 42defframe(PARAM_DST, 4) 43 44 TEXT 45 46 ALIGN(16) 47PROLOGUE(mpn_divexact_1) 48deflit(`FRAME',0) 49 50 movl PARAM_SIZE, %edx 51 52 movl PARAM_SRC, %eax 53 54 movl PARAM_DIVISOR, %ecx 55 subl $1, %edx 56 jnz L(two_or_more) 57 58 movl (%eax), %eax 59 xorl %edx, %edx 60 61 divl %ecx 62 movl PARAM_DST, %ecx 63 64 movl %eax, (%ecx) 65 ret 66 67 68L(two_or_more): 69 C eax src 70 C ebx 71 C ecx divisor 72 C edx size-1 73 74 movl %ecx, %eax 75 bsfl %ecx, %ecx C trailing twos 76 77 shrl %cl, %eax C d = divisor without twos 78 movd %eax, %mm6 79 movd %ecx, %mm7 C shift 80 81 shrl %eax C d/2 82 83 andl $127, %eax C d/2, 7 bits 84 85ifdef(`PIC',` 86 LEA( binvert_limb_table, %ecx) 87 movzbl (%eax,%ecx), %eax C inv 8 bits 88',` 89 movzbl binvert_limb_table(%eax), %eax C inv 8 bits 90') 91 92 C 93 94 movd %eax, %mm5 C inv 95 96 movd %eax, %mm0 C inv 97 98 pmuludq %mm5, %mm5 C inv*inv 99 100 C 101 102 pmuludq %mm6, %mm5 C inv*inv*d 103 paddd %mm0, %mm0 C 2*inv 104 105 C 106 107 psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d 108 pxor %mm5, %mm5 109 110 paddd %mm0, %mm5 111 pmuludq %mm0, %mm0 C inv*inv 112 113 pcmpeqd %mm4, %mm4 114 psrlq $32, %mm4 C 0x00000000FFFFFFFF 115 116 C 117 118 pmuludq %mm6, %mm0 C inv*inv*d 119 paddd %mm5, %mm5 C 2*inv 120 121 movl PARAM_SRC, %eax 122 movl PARAM_DST, %ecx 123 pxor %mm1, %mm1 C initial carry limb 124 125 C 126 127 psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d 128 129 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS 130 pushl %eax FRAME_pushl() 131 movq %mm6, %mm0 132 pmuludq %mm5, %mm0 133 movd %mm0, %eax 134 cmpl $1, %eax 135 popl %eax FRAME_popl()') 136 137 pxor %mm0, %mm0 C initial carry bit 138 139 140C The dependent chain here is as follows. 141C 142C latency 143C psubq s = (src-cbit) - climb 2 144C pmuludq q = s*inverse 8 145C pmuludq prod = q*divisor 8 146C psrlq climb = high(prod) 2 147C -- 148C 20 149C 150C Yet the loop measures 19.0 c/l, so obviously there's something gained 151C there over a straight reading of the chip documentation. 152 153L(top): 154 C eax src, incrementing 155 C ebx 156 C ecx dst, incrementing 157 C edx counter, size-1 iterations 158 C 159 C mm0 carry bit 160 C mm1 carry limb 161 C mm4 0x00000000FFFFFFFF 162 C mm5 inverse 163 C mm6 divisor 164 C mm7 shift 165 166 movd (%eax), %mm2 167 movd 4(%eax), %mm3 168 addl $4, %eax 169 punpckldq %mm3, %mm2 170 171 psrlq %mm7, %mm2 172 pand %mm4, %mm2 C src 173 psubq %mm0, %mm2 C src - cbit 174 175 psubq %mm1, %mm2 C src - cbit - climb 176 movq %mm2, %mm0 177 psrlq $63, %mm0 C new cbit 178 179 pmuludq %mm5, %mm2 C s*inverse 180 movd %mm2, (%ecx) C q 181 addl $4, %ecx 182 183 movq %mm6, %mm1 184 pmuludq %mm2, %mm1 C q*divisor 185 psrlq $32, %mm1 C new climb 186 187 subl $1, %edx 188 jnz L(top) 189 190 191L(done): 192 movd (%eax), %mm2 193 psrlq %mm7, %mm2 C src 194 psubq %mm0, %mm2 C src - cbit 195 196 psubq %mm1, %mm2 C src - cbit - climb 197 198 pmuludq %mm5, %mm2 C s*inverse 199 movd %mm2, (%ecx) C q 200 201 emms 202 ret 203 204EPILOGUE() 205