dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
C inner loop is 2*4-way unrolled, which is the best we can do with the
C available registers.  It seems tricky to use the same structure for
C rsblsh1_n, since we cannot feed carry between operations there.

C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6.75
C AMD K6
C AMD K7
C AMD K8

C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*4-way unrolling, for good reasons.
C
C Breaking carry recurrency might be a good idea.  We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.
C -----------------------------------------------------------------------------
C mpn_sublsh1_n_ip1 -- subtract twice the source from the destination in place:
C rp[] = rp[] - (up[] << 1).  AT&T syntax, 32-bit x86, stack-passed arguments.
C
C Inputs (read through the defframe names declared below):
C   PARAM_DST   pointer to the limbs that are read and overwritten (rp)
C   PARAM_SRC   pointer to the limbs that are doubled and subtracted (up)
C   PARAM_SIZE  number of 32-bit limbs to process
C Output:
C   eax  sum of the two carry bits still pending after the last limb, that is
C        the final subtract borrow plus the bit shifted out of the top source
C        limb, so 0, 1 or 2.
C
C Registers: esi and edi are pushed and popped around the body.  ebx and ebp
C are parked in the argument slots (SAVE_EBX, SAVE_EBP) once those arguments
C have been loaded, and the 8-limb block counter lives in the size slot
C (VAR_COUNT).  eax, ecx and edx are used freely.
C
C Carry scheme: each limb needs two one-bit inputs, the bit shifted out of the
C previous source limb and the borrow out of the previous subtraction.  adc r,r
C doubles a limb and takes one bit from CF; sbb takes the other.  Whichever bit
C is not in CF is parked in edx, and rcr / adc on edx swap the parked bit with
C CF.  Both bits have the same weight and the doubled limb always has a free
C low bit, so it does not matter which of the two a given adc or sbb consumes.
C
C With CPU_P6 defined, CF is additionally parked in edx ahead of the dec / incl
C loop control and recovered by shr at the loop head (presumably because
C relying on inc/dec to preserve CF is costly on that family -- TODO confirm).
C -----------------------------------------------------------------------------

defframe(PARAM_SIZE,	12)
defframe(PARAM_SRC,	 8)
defframe(PARAM_DST,	 4)

dnl  re-use parameter space
define(VAR_COUNT,`PARAM_SIZE')
define(SAVE_EBX,`PARAM_SRC')
define(SAVE_EBP,`PARAM_DST')

ASM_START()
	TEXT
	ALIGN(8)
PROLOGUE(mpn_sublsh1_n_ip1)
deflit(`FRAME',0)

define(`rp',  `%edi')
define(`up',  `%esi')

C FRAME_pushl presumably keeps the PARAM_ offsets in step with esp after each
C push -- confirm against the x86 macro definitions.
	mov	PARAM_SIZE, %eax	C size
	push	up			FRAME_pushl()
	push	rp			FRAME_pushl()
	xor	%edx, %edx		C no carry bit parked yet
	mov	PARAM_SRC, up
	mov	PARAM_DST, rp
	mov	%ebx, SAVE_EBX		C source slot already consumed, park ebx
	mov	%eax, %ebx
	shr	$3, %eax		C number of full 8-limb blocks

	not	%eax			C eax = -(size/8) - 1
	and	$7, %ebx		C size % 8; and also leaves CF = 0
	jz	L(exact)

C Leftover loop: one limb per pass, size % 8 passes.
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(up), %ecx
	adc	%ecx, %ecx		C ecx = 2*limb + CF, CF = old top bit
	rcr	%edx			C restore 1st saved carry bit, park CF
	lea	4(up), up		C lea leaves the flags untouched
	sbb	%ecx, (rp)
	lea	4(rp), rp
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx			C dec does not write CF
	jnz	L(oop)
L(exact):
	inc	%eax			C eax = -(size/8); inc does not write CF
	jz	L(end)			C no full 8-limb block to process
	mov	%eax, VAR_COUNT
	mov	%ebp, SAVE_EBP		C destination slot already consumed

C Main loop: 8 limbs per pass as two groups of four.  Each group is doubled
C into eax, ebx, ecx, ebp with an adc chain and then subtracted from rp with
C an sbb chain.
	ALIGN(16)
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(up), %eax
	adc	%eax, %eax
	mov	4(up), %ebx
	adc	%ebx, %ebx
	mov	8(up), %ecx
	adc	%ecx, %ecx
	mov	12(up), %ebp
	adc	%ebp, %ebp

	rcr	%edx			C restore 1st saved carry bit

	sbb	%eax, (rp)
	sbb	%ebx, 4(rp)
	sbb	%ecx, 8(rp)
	sbb	%ebp, 12(rp)

C The second group starts with the CF left by the sbb chain above; the bit
C parked by rcr is brought back just before the second sbb chain.
	mov	16(up), %eax
	adc	%eax, %eax
	mov	20(up), %ebx
	adc	%ebx, %ebx
	mov	24(up), %ecx
	adc	%ecx, %ecx
	mov	28(up), %ebp
	adc	%ebp, %ebp

	lea	32(up), up
	adc	%edx, %edx		C save a carry bit in edx

	sbb	%eax, 16(rp)
	sbb	%ebx, 20(rp)
	sbb	%ecx, 24(rp)
	sbb	%ebp, 28(rp)

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	incl	VAR_COUNT		C counts up towards zero, CF untouched
	lea	32(rp), rp
	jne	L(top)

	mov	SAVE_EBP, %ebp
L(end):
	mov	SAVE_EBX, %ebx

C Return value = the parked carry plus the carry still pending.  With CPU_P6
C both bits sit in edx, so the low one is first shifted back out into CF;
C otherwise one bit is in edx and the other is still in CF.
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx
	adc	%edx, %eax
',`
	adc	$0, %edx
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	up			FRAME_popl()
	ret
EPILOGUE()
ASM_END()