1dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder. 2 3dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P4: 19.0 cycles/limb 24 25 26C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 27C mp_limb_t divisor); 28C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 29C mp_limb_t divisor, mp_limb_t carry); 30C 31 32defframe(PARAM_CARRY, 16) 33defframe(PARAM_DIVISOR,12) 34defframe(PARAM_SIZE, 8) 35defframe(PARAM_SRC, 4) 36 37 TEXT 38 39 ALIGN(16) 40PROLOGUE(mpn_modexact_1c_odd) 41deflit(`FRAME',0) 42 43 movd PARAM_CARRY, %mm1 44 jmp L(start_1c) 45 46EPILOGUE() 47 48 49 ALIGN(16) 50PROLOGUE(mpn_modexact_1_odd) 51deflit(`FRAME',0) 52 53 pxor %mm1, %mm1 C carry limb 54L(start_1c): 55 movl PARAM_DIVISOR, %eax 56 57 movd PARAM_DIVISOR, %mm7 58 59 shrl %eax 60 61 andl $127, %eax C d/2, 7 bits 62 63ifdef(`PIC',` 64 LEA( binvert_limb_table, %edx) 65 movzbl (%eax,%edx), %eax C inv 8 bits 66',` 67 movzbl binvert_limb_table(%eax), %eax C inv 8 bits 68') 69 70 C 71 72 movd %eax, %mm6 C inv 73 74 movd %eax, %mm0 C inv 75 76 pmuludq %mm6, %mm6 C inv*inv 77 78 C 79 80 pmuludq %mm7, %mm6 C inv*inv*d 81 paddd %mm0, %mm0 C 2*inv 82 83 C 84 85 psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d 86 pxor %mm6, %mm6 87 88 paddd %mm0, %mm6 89 pmuludq %mm0, %mm0 C inv*inv 90 91 C 92 93 pmuludq %mm7, %mm0 C inv*inv*d 94 paddd %mm6, %mm6 C 2*inv 95 96 97 movl PARAM_SRC, %eax 98 movl PARAM_SIZE, %ecx 99 100 C 101 102 psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d 103 104 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS 105 pushl %eax FRAME_pushl() 106 movd %mm6, %eax 107 imul PARAM_DIVISOR, %eax 108 cmpl $1, %eax 109 popl %eax FRAME_popl()') 110 111 pxor %mm0, %mm0 C carry bit 112 113 114C The dependent chain here is as follows. 115C 116C latency 117C psubq s = (src-cbit) - climb 2 118C pmuludq q = s*inverse 8 119C pmuludq prod = q*divisor 8 120C psrlq climb = high(prod) 2 121C -- 122C 20 123C 124C Yet the loop measures 19.0 c/l, so obviously there's something gained 125C there over a straight reading of the chip documentation. 126 127L(top): 128 C eax src, incrementing 129 C ebx 130 C ecx counter, limbs 131 C edx 132 C 133 C mm0 carry bit 134 C mm1 carry limb 135 C mm6 inverse 136 C mm7 divisor 137 138 movd (%eax), %mm2 139 addl $4, %eax 140 141 psubq %mm0, %mm2 C src - cbit 142 143 psubq %mm1, %mm2 C src - cbit - climb 144 movq %mm2, %mm0 145 psrlq $63, %mm0 C new cbit 146 147 pmuludq %mm6, %mm2 C s*inverse 148 149 movq %mm7, %mm1 150 pmuludq %mm2, %mm1 C q*divisor 151 psrlq $32, %mm1 C new climb 152 153 subl $1, %ecx 154 jnz L(top) 155 156 157L(done): 158 paddq %mm1, %mm0 159 movd %mm0, %eax 160 emms 161 ret 162 163EPILOGUE() 164