dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
C The innerloop is 2*3-way unrolled, which is best we can do with the available
C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
C cannot feed carry between operations there.

C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6
C AMD K6			 ?
C AMD K7			 2.5
C AMD K8

C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
C that means we need an initial magic multiply.
C
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
C cannot do rsblsh1_n since we feed carry from the shift blocks to the
C add/subtract blocks, which is right for addition but reversed for
C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
C without losing any time, since we're not issue limited but carry recurrency
C latency.
C
C Breaking carry recurrency might be a good idea.  We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.

C Stack parameters (cdecl, 32-bit): dst, src, dbld (the operand to be
C doubled, i.e. vp), size.
defframe(PARAM_SIZE,	16)
defframe(PARAM_DBLD,	12)
defframe(PARAM_SRC,	 8)
defframe(PARAM_DST,	 4)

dnl  re-use parameter space
define(VAR_COUNT,`PARAM_DST')
define(VAR_TMP,`PARAM_DBLD')

ASM_START()
	TEXT
	ALIGN(8)
PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

define(`rp', `%edi')
define(`up', `%esi')
define(`vp', `%ebp')

	C 0x2aaaaaab = ceil(2^32/6); the high half of 0x2aaaaaab * size
	C (in edx after the mul below) is floor(size/6), the number of
	C 6-limb unrolled iterations.
	mov	$0x2aaaaaab, %eax

	push	%ebx			FRAME_pushl()
	mov	PARAM_SIZE, %ebx	C size

	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp

	mul	%ebx			C edx = size\6 (see magic constant above)

	push	up			FRAME_pushl()
	mov	PARAM_SRC, up

	not	%edx			C count = -(size\6)-1
	mov	%edx, VAR_COUNT

	push	vp			FRAME_pushl()
	mov	PARAM_DBLD, vp

	C Compute size mod 6, the number of single-limb pre-loop iterations.
	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
	xor	%edx, %edx		C clear carry-save register
	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
	or	%ebx, %ebx		C sets ZF; also clears CF for first adc
	jz	L(exact)

C Single-limb loop handling the size%6 leftover limbs, carrying both the
C shift carry and the add carry across iterations via edx (two bits on P6,
C where rcr of a doubly-saved edx would not fit in one bit position).
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax		C eax = 2*vp[i] + shift carry-in
	rcr	%edx			C restore 1st saved carry bit
	lea	4(vp), vp
	adc	(up), %eax		C eax += up[i] + add carry-in
	lea	4(up), up
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx
	mov	%eax, (rp)
	lea	4(rp), rp
	jnz	L(oop)
	mov	vp, VAR_TMP
L(exact):
	incl	VAR_COUNT		C count+1 = -(size\6); zero when size<6
	jz	L(end)

	ALIGN(16)
C Main 2*3-way unrolled loop: shift three limbs of vp, then add three limbs
C of up, twice per iteration (6 limbs).  The shift-block carry is handed to
C the add block (and back) through edx, saved with adc and restored with rcr.
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax
	mov	4(vp), %ebx
	adc	%ebx, %ebx
	mov	8(vp), %ecx
	adc	%ecx, %ecx

	rcr	%edx			C restore 1st saved carry bit

	adc	(up), %eax
	mov	%eax, (rp)
	adc	4(up), %ebx
	mov	%ebx, 4(rp)
	adc	8(up), %ecx
	mov	%ecx, 8(rp)

	mov	12(vp), %eax
	adc	%eax, %eax
	mov	16(vp), %ebx
	adc	%ebx, %ebx
	mov	20(vp), %ecx
	adc	%ecx, %ecx

	lea	24(vp), vp
	adc	%edx, %edx		C save a carry bit in edx

	adc	12(up), %eax
	mov	%eax, 12(rp)
	adc	16(up), %ebx
	mov	%ebx, 16(rp)
	adc	20(up), %ecx

	lea	24(up), up

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	mov	%ecx, 20(rp)
	incl	VAR_COUNT		C incl/lea/jne preserve the saved carry in edx
	lea	24(rp), rp
	jne	L(top)

L(end):
	pop	vp			FRAME_popl()
	pop	up			FRAME_popl()

C Return value: the final carry limb, extracted from the saved bits in edx.
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx		C low saved bit -> CF
	adc	%edx, %eax		C eax = remaining bit + CF
',`
	adc	$0, %edx		C fold pending CF into saved bit
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	%ebx			FRAME_popl()
	ret
EPILOGUE()
ASM_END()