1dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and 2dnl subtract the result from a second limb vector. 3 4dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of either: 10dnl 11dnl * the GNU Lesser General Public License as published by the Free 12dnl Software Foundation; either version 3 of the License, or (at your 13dnl option) any later version. 14dnl 15dnl or 16dnl 17dnl * the GNU General Public License as published by the Free Software 18dnl Foundation; either version 2 of the License, or (at your option) any 19dnl later version. 20dnl 21dnl or both in parallel, as here. 22dnl 23dnl The GNU MP Library is distributed in the hope that it will be useful, but 24dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26dnl for more details. 27dnl 28dnl You should have received copies of the GNU General Public License and the 29dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30dnl see https://www.gnu.org/licenses/. 31 32include(`../config.m4') 33 34 35C cycles/limb 36C P6 model 0-8,10-12 - 37C P6 model 9 (Banias) 6.8 38C P6 model 13 (Dothan) 6.9 39C P4 model 0-1 (Willamette) ? 40C P4 model 2 (Northwood) 5.87 41C P4 model 3-4 (Prescott) 6.5 42 43C This code represents a step forwards compared to the code available before 44C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is 45C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and 46C Prescott compared to the old code. 47C 48C The arrangements made here to get a two instruction dependent chain are 49C slightly subtle. 
In the loop the carry (or borrow rather) is a negative so 50C that a paddq can be used to give a low limb ready to store, and a high limb 51C ready to become the new carry after a psrlq. 52C 53C If the carry was a simple twos complement negative then the psrlq shift would 54C need to bring in 0 bits or 1 bits according to whether the high was zero or 55C non-zero, since a non-zero value would represent a negative needing sign 56C extension. That wouldn't be particularly easy to arrange and certainly would 57C add an instruction to the dependent chain, so instead an offset is applied so 58C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to 59C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore 60C always positive and can always have 0 bits shifted in, which is what psrlq 61C does. 62C 63C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be 64C done off the dependent chain. The total adjustment then is to add 65C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF 66C to remove the offset from the current carry, for a net add of 67C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when 68C fetched. 69C 70C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement 71C negative, which is how it's undone for the return value, but that doesn't 72C seem as clear. 
C Stack parameters shared by both entry points.
C NOTE(review): the numbers are presumably byte offsets from the stack
C pointer at function entry, as laid out by the defframe macro -- confirm
C against the macro definition, which is not visible in this file.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	TEXT
	ALIGN(16)

C mpn_submul_1c -- same operation as mpn_submul_1 but with an initial
C borrow taken from PARAM_CARRY.  It only loads that borrow into mm1 and
C then joins the common body at start_1c.
PROLOGUE(mpn_submul_1c)
deflit(`FRAME',0)
	movd	PARAM_CARRY, %mm1	C caller supplied initial borrow
	jmp	L(start_1c)
EPILOGUE()

C mpn_submul_1 -- multiply the PARAM_SIZE limbs at PARAM_SRC by
C PARAM_MULTIPLIER and subtract the products from the limbs at PARAM_DST,
C storing the result back to PARAM_DST.  Limbs are handled as 32-bit
C values, 4 bytes apart.  The final borrow is returned in eax.
C
C Register roles in the common body:
C   eax      src pointer (up)
C   edx      dst pointer (rp)
C   ecx      count of limbs not yet loaded
C   mm7      multiplier
C   mm6      constant 0xFFFFFFFE00000001 added to every dst limb
C   mm0      running borrow, kept in the offset form 0xFFFFFFFF - borrow
C   mm3,mm4  src limb (then its product) and dst limb, even position
C   mm1,mm2  src limb (then its product) and dst limb, odd position
C
C The first src and dst limbs are loaded before the size is examined, so
C the size is expected to be at least 1.  Nothing here handles size 0.
PROLOGUE(mpn_submul_1)
deflit(`FRAME',0)
	pxor	%mm1, %mm1		C initial borrow

L(start_1c):
	C Integer parameter loads are interleaved with the MMX constant
	C setup.  All-ones registers are shifted to form the two masks.
	mov	PARAM_SRC, %eax
	pcmpeqd	%mm0, %mm0

	movd	PARAM_MULTIPLIER, %mm7
	pcmpeqd	%mm6, %mm6

	mov	PARAM_DST, %edx
	psrlq	$32, %mm0		C 0x00000000FFFFFFFF

	mov	PARAM_SIZE, %ecx
	psllq	$32, %mm6		C 0xFFFFFFFF00000000

	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001

	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow


	C Prime the pipeline with limb 0.
	movd	(%eax), %mm3		C up
	movd	(%edx), %mm4		C rp

	C The jnz below tests the flags from this add.  The MMX
	C instructions placed in between leave EFLAGS untouched.
	add	$-1, %ecx
	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm3
	jnz	L(gt1)
	C size == 1: finish the single limb and go to the common return.
	psubq	%mm3, %mm4		C prod
	paddq	%mm4, %mm0		C borrow
	movd	%mm0, (%edx)		C result
	jmp	L(rt)

	C size > 1: also load limb 1, as yet unmultiplied and without
	C the offset constant.
L(gt1):	movd	4(%eax), %mm1		C up
	movd	4(%edx), %mm2		C rp

	add	$-1, %ecx
	jz	L(eev)			C size == 2, no loop iterations needed

	C Main loop, two limbs per iteration.  On entry to top:
	C   mm3 = product for the limb at 0(eax), mm4 = limb at 0(edx)
	C         with the offset constant already added
	C   mm1, mm2 = raw limbs from 4(eax) and 4(edx)
	C Each half finishes one limb (subtract product, fold into the
	C borrow, store the low 32 bits, shift the high 32 bits down as
	C the next borrow) while starting the multiply for the next limb
	C and loading the one after that.
	ALIGN(16)
L(top):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm1
	psubq	%mm3, %mm4		C prod
	movd	8(%eax), %mm3		C up
	paddq	%mm4, %mm0		C borrow
	movd	8(%edx), %mm4		C rp
	movd	%mm0, (%edx)		C result
	psrlq	$32, %mm0

	add	$-1, %ecx
	jz	L(eod)			C limb just loaded at offset 8 was the last

	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm3
	psubq	%mm1, %mm2		C prod
	movd	12(%eax), %mm1		C up
	paddq	%mm2, %mm0		C borrow
	movd	12(%edx), %mm2		C rp
	movd	%mm0, 4(%edx)		C result
	psrlq	$32, %mm0

	C lea advances the pointers without touching the flags.
	lea	8(%eax), %eax
	lea	8(%edx), %edx
	add	$-1, %ecx
	jnz	L(top)


	C Wind down, even size: two limbs are still in flight, the one
	C in mm3,mm4 destined for 0(edx) and the one in mm1,mm2 destined
	C for 4(edx).
L(eev):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm1
	psubq	%mm3, %mm4		C prod
	paddq	%mm4, %mm0		C borrow
	movd	%mm0, (%edx)		C result
	psrlq	$32, %mm0
	psubq	%mm1, %mm2		C prod
	paddq	%mm2, %mm0		C borrow
	movd	%mm0, 4(%edx)		C result
	C Common return: the high half of mm0 is 0xFFFFFFFF - borrow, so
	C a ones complement recovers the borrow as the return value.
	C emms clears the MMX state before returning.
L(rt):	psrlq	$32, %mm0
	movd	%mm0, %eax
	not	%eax
	emms
	ret

	C Wind down, odd size of 3 or more: left from the middle of the
	C loop body before the pointers were advanced, so the two limbs
	C in flight go to 4(edx) from mm1,mm2 and 8(edx) from mm3,mm4.
L(eod):	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
	pmuludq	%mm7, %mm3
	psubq	%mm1, %mm2		C prod
	paddq	%mm2, %mm0		C borrow
	movd	%mm0, 4(%edx)		C result
	psrlq	$32, %mm0
	psubq	%mm3, %mm4		C prod
	paddq	%mm4, %mm0		C borrow
	movd	%mm0, 8(%edx)		C result
	jmp	L(rt)
EPILOGUE()