dnl  Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.

dnl  Copyright 2001-2004, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                                 cycles/limb
C                           dst!=src1,2  dst==src1  dst==src2
C P6 model 0-8,10-12             -
C P6 model 9   (Banias)          ?
C P6 model 13  (Dothan)          ?
C P4 model 0-1 (Willamette)      ?
C P4 model 2   (Northwood)      4.25         6          6
C P4 model 3-4 (Prescott)       5            8.5        8.5

C The slightly strange combination of indexing and pointer incrementing
C that's used seems to work best.  Not sure why, but %ecx,4 with src1 and/or
C src2 is a slowdown.
C
C The dependent chain is simply the paddq of x+2*y to the previous carry,
C then psrlq to get the new carry.  That makes 4 c/l the target speed, which
C is almost achieved for separate src/dst but when src==dst the write
C combining anomalies slow it down.
C -----------------------------------------------------------------------
C mpn_addlsh1_n -- dst[] = src1[] + 2*src2[], returning the carry-out limb.
C
C Stack parameters (offsets from the return address, see defframe below):
C    4  dst    destination limb vector
C    8  src1   limb vector added as-is
C   12  src2   limb vector added after a left shift by one bit
C   16  size   number of 32-bit limbs in each vector
C
C Returns in eax the bits of the final sum above bit 31.
C Uses eax, ecx, edx, mm0, mm1, mm2; ebx is saved and restored.
C
C No check for size==0 is made: the loop body runs once before the
C counter is tested, so the caller must pass size >= 1.
C -----------------------------------------------------------------------

defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

dnl  re-use parameter space
dnl  (safe: PARAM_SRC1 is loaded into eax before ebx is stored over it)
define(SAVE_EBX,`PARAM_SRC1')

	TEXT
	ALIGN(8)

PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

	mov	PARAM_SRC1, %eax
	mov	%ebx, SAVE_EBX		C src1 slot is dead now, park ebx there

	mov	PARAM_SRC2, %ebx
	pxor	%mm0, %mm0		C initial carry

	mov	PARAM_DST, %edx

	mov	PARAM_SIZE, %ecx

	lea	(%edx,%ecx,4), %edx	C dst end
	neg	%ecx			C -size

L(top):
	C eax	src1, pointer to current limb (advanced each iteration)
	C ebx	src2, pointer to current limb (advanced each iteration)
	C ecx	counter, limbs, negative
	C edx	dst end
	C mm0	previous 64-bit sum, carry in the bits above bit 31

	movd	(%ebx), %mm2		C src2 limb, zero extended
	movd	(%eax), %mm1		C src1 limb, zero extended
	psrlq	$32, %mm0		C keep only the carry from the last sum
	lea	4(%eax), %eax		C lea leaves the flags alone
	lea	4(%ebx), %ebx

	psllq	$1, %mm2		C 2*src2 limb, up to 33 bits
	paddq	%mm2, %mm1		C src1 + 2*src2

	paddq	%mm1, %mm0		C ... + carry

	movd	%mm0, (%edx,%ecx,4)	C store low 32 bits at dst[size+ecx]
	add	$1, %ecx
	jnz	L(top)


	psrlq	$32, %mm0		C final carry
	mov	SAVE_EBX, %ebx
	movd	%mm0, %eax		C return value
	emms				C clear MMX state before returning
	ret

EPILOGUE()