1dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2000, 2001, 2002, 2003, 2007 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C K6: 10.0 cycles/limb 24 25 26C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 27C mp_limb_t divisor); 28C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 29C mp_limb_t divisor, mp_limb_t carry); 30C 31C A special case for high<divisor at the end measured only about 4 cycles 32C faster, and so isn't used. 33C 34C A special case for size==1 using a divl rather than the inverse measured 35C only about 5 cycles faster, and so isn't used. When size==1 and 36C high<divisor it can skip a division and be a full 24 cycles faster, but 37C this isn't an important case. 38 39defframe(PARAM_CARRY, 16) 40defframe(PARAM_DIVISOR,12) 41defframe(PARAM_SIZE, 8) 42defframe(PARAM_SRC, 4) 43 44 TEXT 45 46 ALIGN(32) 47PROLOGUE(mpn_modexact_1c_odd) 48deflit(`FRAME',0) 49 50 movl PARAM_DIVISOR, %ecx 51 pushl %esi FRAME_pushl() 52 53 movl PARAM_CARRY, %edx 54 jmp L(start_1c) 55 56EPILOGUE() 57 58 59 ALIGN(16) 60PROLOGUE(mpn_modexact_1_odd) 61deflit(`FRAME',0) 62 63 movl PARAM_DIVISOR, %ecx 64 pushl %esi FRAME_pushl() 65 66 xorl %edx, %edx 67L(start_1c): 68 pushl %edi FRAME_pushl() 69 70 shrl %ecx C d/2 71 movl PARAM_DIVISOR, %esi 72 73 andl $127, %ecx C d/2, 7 bits 74 pushl %ebp FRAME_pushl() 75 76ifdef(`PIC',` 77 LEA( binvert_limb_table, %edi) 78Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits 79',` 80 movzbl binvert_limb_table(%ecx), %edi C inv 8 bits 81') 82 leal (%edi,%edi), %ecx C 2*inv 83 84 imull %edi, %edi C inv*inv 85 86 movl PARAM_SRC, %eax 87 movl PARAM_SIZE, %ebp 88 89 imull %esi, %edi C inv*inv*d 90 91 pushl %ebx FRAME_pushl() 92 leal (%eax,%ebp,4), %ebx C src end 93 94 subl %edi, %ecx C inv = 2*inv - inv*inv*d 95 leal (%ecx,%ecx), %edi C 2*inv 96 97 imull %ecx, %ecx C inv*inv 98 99 movl (%eax), %eax C src low limb 100 negl %ebp C -size 101 102 imull %esi, %ecx C inv*inv*d 103 104 subl %ecx, %edi C inv = 2*inv - inv*inv*d 105 106 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 107 pushl %eax 108 movl %esi, %eax 109 imull %edi, %eax 110 cmpl $1, %eax 111 popl %eax') 112 113 jmp L(entry) 114 115 116C Rotating the mul to the top of the loop saves 1 cycle, presumably by 117C hiding the loop control under the imul latency. 118C 119C The run time is 10 cycles, but decoding is only 9 (and the dependent chain 120C only 8). It's not clear how to get down to 9 cycles. 121C 122C The xor and rcl to handle the carry bit could be an sbb instead, with the 123C the carry bit add becoming a sub, but that doesn't save anything. 124 125L(top): 126 C eax (low product) 127 C ebx src end 128 C ecx carry bit, 0 or 1 129 C edx (high product, being carry limb) 130 C esi divisor 131 C edi inverse 132 C ebp counter, limbs, negative 133 134 mull %esi 135 136 movl (%ebx,%ebp,4), %eax 137 addl %ecx, %edx C apply carry bit to carry limb 138 139L(entry): 140 xorl %ecx, %ecx 141 subl %edx, %eax C apply carry limb 142 143 rcll %ecx 144 145 imull %edi, %eax 146 147 incl %ebp 148 jnz L(top) 149 150 151 152 popl %ebx 153 popl %ebp 154 155 mull %esi 156 157 popl %edi 158 popl %esi 159 160 leal (%ecx,%edx), %eax 161 162 ret 163 164EPILOGUE() 165