dnl  Intel P6 mpn_lshsub_n -- mpn papillion support.

dnl  Copyright 2006 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)

C (1) The loop is not scheduled in any way, and scheduling attempts have not
C     improved speed on P6/13.  Presumably, the K7 will want scheduling, if it
C     at all wants to use MMX.
C (2) We could save a register by not alternatingly using eax and edx in the
C     loop.
C mpn_lshsub_n -- subtract {vp,n} from {up,n}, shift the difference left by
C cnt bits, and store the low n limbs of the result at {rp,n}.  Limbs are
C 32 bits.
C
C Stack arguments, as seen after the three pushes in the prologue:
C	16(%esp)  rp   destination limb pointer
C	20(%esp)  up   minuend limb pointer
C	24(%esp)  vp   subtrahend limb pointer
C	28(%esp)  n    limb count.  Must be non-zero: the initial jump enters at
C		       L(ent), past the jecxz test at L(top), so n = 0 would
C		       process limbs that do not exist.
C	32(%esp)  cnt  shift count.  Presumably 1 <= cnt <= 31 -- TODO confirm
C		       against the callers.
C
C Return value in %eax:
C	(borrow << cnt) | (most significant difference limb >> (32-cnt))
C
C Saves and restores %edi, %esi, %ebx.  Clobbers %eax, %ecx, %edx, %mm0,
C %mm1, %mm7 and the flags.  emms is executed before returning.
C
C Method: the loop is unrolled 8 times and entered through a computed jump to
C L(ent) + 22*x, where x = (-n) mod 8 is the number of leading blocks to skip.
C This requires every one of the 8 limb blocks to assemble to exactly 22
C bytes.  The borrow lives in the carry flag for the whole loop, so only
C flag-preserving instructions (lea, jecxz, jmp, MMX ops) may sit between the
C sbb instructions.

C Register roles.  The .byte sequences at L(ent) hard-code these assignments,
C so the two must be changed together.
define(`rp',	`%edi')
define(`up',	`%esi')
define(`vp',	`%ebx')
define(`n',	`%ecx')
define(`cnt',	`%mm7')

ASM_START()

	TEXT
	ALIGN(16)

PROLOGUE(mpn_lshsub_n)
	push	%edi
	push	%esi
	push	%ebx

	mov	16(%esp), rp
	mov	20(%esp), up
	mov	24(%esp), vp
	mov	28(%esp), n
	mov	$32, %eax
	sub	32(%esp), %eax
	movd	%eax, cnt			C mm7 = 32-cnt, the psrlq count

	C Point past the end of each operand and index with a negative n.
	lea	(up,n,4), up
	lea	(vp,n,4), vp
	lea	(rp,n,4), rp

	neg	n
	mov	n, %eax
	and	$-8, n				C n = -n rounded down to a multiple of 8
	and	$7, %eax			C eax = x, number of blocks to skip
	shl	%eax				C eax = 2x, and CF = 0 as x <= 7
	lea	(%eax,%eax,4), %edx		C edx = 10x
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	lea	L(ent)(%eax,%edx,2), %eax	C eax = L(ent) + 22x
')

	C No previous difference limb yet, whichever block is entered first.
	pxor	%mm1, %mm1
	pxor	%mm0, %mm0

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	lea	(%eax,%edx,2), %eax		C eax = 22x
	add	$L(ent)-L(here), %eax
	add	(%esp), %eax			C plus the return address, ie. L(here)
	ret_internal
')

L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
	sbb	%eax, %eax			C eax = -borrow
	neg	%eax				C eax = borrow
	mov	32(%esp), %ecx			C ecx = cnt
	shld	%cl, %edx, %eax			C edx = last difference limb

	emms

	pop	%ebx
	pop	%esi
	pop	%edi
	ret

	ALIGN(16)
L(top):	jecxz	L(end)

	C Block 0.  The assembler would drop a zero displacement and make this
	C block 19 bytes instead of 22, which breaks the computed jump for
	C every n that is not a multiple of 8.  The three memory instructions
	C are therefore emitted as explicit bytes with an 8-bit displacement
	C of zero.
L(ent):	.byte	0x8b,0x44,0x8e,0x00		C mov	0(%esi,%ecx,4), %eax
	.byte	0x1b,0x44,0x8b,0x00		C sbb	0(%ebx,%ecx,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1			C mm1 = new limb : previous limb
	psrlq	%mm7, %mm1			C low half = shifted result limb
	.byte	0x0f,0x7e,0x4c,0x8f,0x00	C movd	%mm1, 0(%edi,%ecx,4)

	C Blocks 1 to 7.  Each is 22 bytes thanks to the non-zero disp8.  The
	C blocks alternate between eax/mm0 and edx/mm1 for the new limb.
	mov	4(up,n,4), %edx
	sbb	4(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 4(rp,n,4)

	mov	8(up,n,4), %eax
	sbb	8(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 8(rp,n,4)

	mov	12(up,n,4), %edx
	sbb	12(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 12(rp,n,4)

	mov	16(up,n,4), %eax
	sbb	16(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 16(rp,n,4)

	mov	20(up,n,4), %edx
	sbb	20(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 20(rp,n,4)

	mov	24(up,n,4), %eax
	sbb	24(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 24(rp,n,4)

	mov	28(up,n,4), %edx
	sbb	28(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 28(rp,n,4)

	lea	8(n), n				C advance 8 limbs, carry flag untouched
	jmp	L(top)

EPILOGUE()