dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2

dnl  Copyright 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C        cycles/limb (approx)
C      dst!=src1,2  dst==src1  dst==src2
C P4:      4.5         6.5        6.5


C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
C                          mp_size_t size);
C
C The slightly strange combination of indexing and pointer incrementing
C that's used seems to work best.  Not sure why, but for instance leal
C incrementing on %esi is a 1 or 2 cycle slowdown.
C
C The dependent chain is paddq combining the carry and next (shifted) part,
C plus psrlq to move the new carry down.  That, and just 4 mmx instructions
C in total, makes 4 c/l the target speed, which is almost achieved for
C separate src/dst but when src==dst the write combining anomalies slow it
C down.
C -----------------------------------------------------------------------
C mpn_rsh1add_n (wp, xp, yp, size)
C
C Adds the size-limb operands at xp and yp and stores the sum shifted
C right by one bit into wp[0..size-1].
C
C In:     four stack arguments, named by the defframe lines below
C Out:    eax = bit 0 of xp[0]+yp[0], i.e. the bit shifted out
C         wp[size-1] is written from all 32 bits of mm0, so a carry out of
C         the top limb addition ends up as bit 31 of wp[size-1]
C Saved:  ebx, esi (spilled into the xp and yp argument slots)
C Clobb:  eax, ecx, edx, mm0, mm1, mm2, flags
C
C size is assumed to be at least 1: limb 0 of both sources is loaded
C before size is examined.
C -----------------------------------------------------------------------

defframe(PARAM_SIZE, 16)
defframe(PARAM_YP, 12)
defframe(PARAM_XP, 8)
defframe(PARAM_WP, 4)

dnl  re-use parameter space
dnl  Each slot is only overwritten after its argument has been loaded into
dnl  a register, so no extra stack space is needed for the two saves.
define(SAVE_EBX,`PARAM_XP')
define(SAVE_ESI,`PARAM_YP')

	TEXT
	ALIGN(8)

PROLOGUE(mpn_rsh1add_n)
deflit(`FRAME',0)

	movl	PARAM_XP, %edx
	movl	%ebx, SAVE_EBX		C xp slot already read, now holds ebx

	movl	PARAM_YP, %ebx
	movl	%esi, SAVE_ESI		C yp slot already read, now holds esi

	movl	PARAM_WP, %esi

	movd	(%edx), %mm0		C xp[0]

	movd	(%ebx), %mm1		C yp[0]
	movl	PARAM_SIZE, %ecx

	movl	(%edx), %eax		C xp[0]

	addl	(%ebx), %eax		C xp[0]+yp[0], only bit 0 is used below

	paddq	%mm1, %mm0		C xp[0]+yp[0], 64-bit add keeps the carry
	leal	(%esi,%ecx,4), %esi	C wp end
	negl	%ecx			C -size

	psrlq	$1, %mm0		C (xp[0]+yp[0])/2
	and	$1, %eax		C return value, rsh1 bit of xp[0]+yp[0]
	addl	$1, %ecx		C -(size-1)
	jz	L(done)			C size==1, mm0 is already dst[0]


L(top):
	C eax	return value
	C ebx	&yp[i], advanced one limb per iteration
	C ecx	counter, limbs, -(size-1) to -1 inclusive
	C edx	&xp[i], advanced one limb per iteration
	C esi	wp end
	C mm0	carry (32 bits)
	C
	C i is the index of the limb stored by this iteration, starting at 0.
	C On entry the low 31 bits of mm0 are the final low bits of dst[i] and
	C bit 31 is the pending carry, which has the same weight as bit 0 of
	C the next limb sum once that sum is shifted right by one.

	movd	4(%edx), %mm1		C xp[i+1]
	movd	4(%ebx), %mm2		C yp[i+1]
	leal	4(%edx), %edx		C step xp
	leal	4(%ebx), %ebx		C step yp
	paddq	%mm2, %mm1		C xp[i+1]+yp[i+1]
	psllq	$31, %mm1		C low bit at 31, further 32 above

	paddq	%mm1, %mm0		C next sum joins at bit 31, adding to carry from prev add
	movd	%mm0, -4(%esi,%ecx,4)	C low ready to store dst[i]

	psrlq	$32, %mm0		C high becomes new carry

	addl	$1, %ecx
	jnz	L(top)


L(done):
	movd	%mm0, -4(%esi)		C dst[size-1]
	movl	SAVE_EBX, %ebx

	movl	SAVE_ESI, %esi
	emms				C clear MMX state before returning
	ret

EPILOGUE()