dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.

dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                           cycles/limb
C P6 model 0-8,10-12            -
C P6 model 9   (Banias)         6.8
C P6 model 13  (Dothan)         6.9
C P4 model 0-1 (Willamette)     ?
C P4 model 2   (Northwood)      5.87
C P4 model 3-4 (Prescott)       6.5

C This code represents a step forwards compared to the code available before
C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is
C not good for P6.  For P4 it saved a bit over 1 c/l for both Northwood and
C Prescott compared to the old code.
C
C The arrangements made here to get a two instruction dependent chain are
C slightly subtle.  In the loop the carry (or borrow rather) is a negative so
C that a paddq can be used to give a low limb ready to store, and a high limb
C ready to become the new carry after a psrlq.
C
C If the carry was a simple two's complement negative then the psrlq shift
C would need to bring in 0 bits or 1 bits according to whether the high limb
C was zero or non-zero, since a non-zero value would represent a negative
C needing sign extension.  That wouldn't be particularly easy to arrange and
C certainly would add an instruction to the dependent chain, so instead an
C offset is applied so that the high limb will be 0xFFFFFFFF+c.  With c in the
C range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to
C 0xFFFFFFFF and is therefore always non-negative and can always have 0 bits
C shifted in, which is what psrlq does.
C
C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
C done off the dependent chain.  The total adjustment then is to add
C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
C to remove the offset from the current carry, for a net add of
C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when
C fetched.
C
C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
C negative, which is how it's undone for the return value, but that doesn't
C seem as clear.

C ---------------------------------------------------------------------------
C mpn_submul_1  (dst, src, size, multiplier)
C mpn_submul_1c (dst, src, size, multiplier, carry)
C
C Computes dst[i] -= src[i] * multiplier over size 32-bit limbs, lowest limb
C first, and returns the borrow limb out of the top in %eax.  mpn_submul_1c
C starts from the supplied carry (borrow) limb, mpn_submul_1 starts from zero.
C
C Arguments are read from the stack through the defframe names below.
C size must be at least 1: the count is decremented before it is first
C tested, so a size of 0 would wrap around rather than return immediately.
C
C Register use:
C   %eax       src pointer (overwritten with the return value at L(rt))
C   %edx       dst pointer
C   %ecx       number of limbs not yet loaded
C   %mm7       multiplier
C   %mm6       constant 0xFFFFFFFE00000001, added to each dst limb as fetched
C   %mm0       running borrow, kept as 0xFFFFFFFF - borrow
C   %mm3,%mm4  src,dst limb pair for even offsets
C   %mm1,%mm2  src,dst limb pair for odd offsets
C
C The loop is software pipelined and handles two limbs per iteration: while
C one pair is being subtracted and stored, the other pair is being loaded and
C multiplied.  L(eev) finishes an even size, L(eod) finishes an odd size of 3
C or more, and a size of 1 is finished inline before L(gt1).
C ---------------------------------------------------------------------------

defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	TEXT
	ALIGN(16)

PROLOGUE(mpn_submul_1c)
deflit(`FRAME',0)
	movd	PARAM_CARRY, %mm1	C caller supplied initial borrow
	jmp	L(start_1c)
EPILOGUE()

PROLOGUE(mpn_submul_1)
deflit(`FRAME',0)
	pxor	%mm1, %mm1		C initial borrow

L(start_1c):
	mov	PARAM_SRC, %eax
	pcmpeqd	%mm0, %mm0		C all ones

	movd	PARAM_MULTIPLIER, %mm7
	pcmpeqd	%mm6, %mm6		C all ones

	mov	PARAM_DST, %edx
	psrlq	$32, %mm0		C 0x00000000FFFFFFFF

	mov	PARAM_SIZE, %ecx
	psllq	$32, %mm6		C 0xFFFFFFFF00000000

	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001

	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow


	movd	(%eax), %mm3		C up
	movd	(%edx), %mm4		C rp

	add	$-1, %ecx
	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm3
	jnz	L(gt1)			C flags still from the add, MMX ops leave them alone
	psubq	%mm3, %mm4		C prod
	paddq	%mm4, %mm0		C borrow
	movd	%mm0, (%edx)		C result
	jmp	L(rt)

L(gt1):	movd	4(%eax), %mm1		C up
	movd	4(%edx), %mm2		C rp

	add	$-1, %ecx
	jz	L(eev)			C size was exactly 2

	ALIGN(16)
L(top):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm1
	psubq	%mm3, %mm4		C prod
	movd	8(%eax), %mm3		C up
	paddq	%mm4, %mm0		C borrow
	movd	8(%edx), %mm4		C rp
	movd	%mm0, (%edx)		C result
	psrlq	$32, %mm0		C high half becomes next offset borrow

	add	$-1, %ecx
	jz	L(eod)			C nothing more to load, odd size

	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm3
	psubq	%mm1, %mm2		C prod
	movd	12(%eax), %mm1		C up
	paddq	%mm2, %mm0		C borrow
	movd	12(%edx), %mm2		C rp
	movd	%mm0, 4(%edx)		C result
	psrlq	$32, %mm0		C high half becomes next offset borrow

	lea	8(%eax), %eax		C advance two limbs
	lea	8(%edx), %edx
	add	$-1, %ecx
	jnz	L(top)


C Even size: pairs mm3,mm4 and mm1,mm2 are both loaded and still pending.
L(eev):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm1
	psubq	%mm3, %mm4		C prod
	paddq	%mm4, %mm0		C borrow
	movd	%mm0, (%edx)		C result
	psrlq	$32, %mm0
	psubq	%mm1, %mm2		C prod
	paddq	%mm2, %mm0		C borrow
	movd	%mm0, 4(%edx)		C result
L(rt):	psrlq	$32, %mm0		C 0xFFFFFFFF - borrow out
	movd	%mm0, %eax
	not	%eax			C remove the offset, leaving the borrow
	emms
	ret

C Odd size: pairs mm1,mm2 (offset 4) and mm3,mm4 (offset 8) are still pending.
L(eod):	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm3
	psubq	%mm1, %mm2		C prod
	paddq	%mm2, %mm0		C borrow
	movd	%mm0, 4(%edx)		C result
	psrlq	$32, %mm0
	psubq	%mm3, %mm4		C prod
	paddq	%mm4, %mm0		C borrow
	movd	%mm0, 8(%edx)		C result
	jmp	L(rt)
EPILOGUE()