dnl  AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
dnl  AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1

dnl  Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.14	(mpn_add_n + mpn_rshift need 4.125)
C AMD K10	 2.14	(mpn_add_n + mpn_rshift need 4.125)
C Intel P4	12.75
C Intel core2	 3.75
C Intel NMH	 4.4
C Intel SBR	 ?
C Intel atom	 ?
C VIA nano	 3.25

C TODO
C  * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
C INPUT PARAMETERS
C   rp   limbs written with the halved sum or difference
C   up   first source operand, read only
C   vp   second source operand, read only
C   n    limb count.  The first limb is loaded unconditionally, so the code
C        presumably relies on the caller passing n >= 1.
C   The _nc entry point additionally takes a carry-in in %r8.
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',` %rcx')

C Select the add or the subtract flavour.  ADDSUB is used for the very first
C limb of the plain entry point, ADCSBB everywhere a carry or borrow has to
C be propagated.
ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func_n,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func_n,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C REGISTER USAGE (both entry points, from L(com) onwards)
C   %rax      return value, the bit shifted out of the lowest limb
C   %rbx      pending limb: the most recent sum or difference, already
C             rotated right by one, with the carry or borrow of the chain
C             parked in bit 63.  Callee-saved, hence the push and pop.
C   %r8-%r11  limbs of the group currently being processed.  %r11 also
C             holds n mod 4 during the dispatch.

ASM_START()
	TEXT
	ALIGN(16)
C Entry point with carry-in.  Handles the lowest limb with the incoming
C carry or borrow, then continues in the shared body of func_n.
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx

	xor	R32(%rax), R32(%rax)	C clear the return value
	neg	%r8			C CF = 1 iff the carry-in is nonzero
	mov	(up), %rbx
	ADCSBB	(vp), %rbx		C lowest limb, carry-in included
	jmp	L(com)
EPILOGUE()

	ALIGN(16)
C Entry point without carry-in, and the body shared by both entry points.
PROLOGUE(func_n)
	FUNC_ENTRY(4)
	push	%rbx

	xor	R32(%rax), R32(%rax)	C clear the return value
	mov	(up), %rbx
	ADDSUB	(vp), %rbx		C lowest limb
L(com):
	rcr	%rbx			C carry into bit 63, low bit into CF
	adc	R32(%rax), R32(%rax)	C return value = bit shifted out

C Dispatch on n mod 4.  One limb is already pending in %rbx, so the feed-in
C code below consumes 0, 1, 2 or 3 further limbs to leave a multiple of four
C for the unrolled loop.
	mov	R32(n), R32(%r11)
	and	$3, R32(%r11)

	cmp	$1, R32(%r11)
	je	L(grp)			C n = 1 5 9 ...  nothing to feed in

	cmp	$2, R32(%r11)
	jne	L(m3)			C not n = 2 6 10 ...
C One extra limb.
	add	%rbx, %rbx		C bit 63 back into CF, undo the rotate
	mov	8(up), %r10
	ADCSBB	8(vp), %r10
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	rcr	%r10			C highest limb first, bits ripple down
	rcr	%rbx
	mov	%rbx, -8(rp)
	jmp	L(w1)

L(m3):	cmp	$3, R32(%r11)
	jne	L(m0)			C not n = 3 7 11 ...
C Two extra limbs.
	add	%rbx, %rbx		C bit 63 back into CF, undo the rotate
	mov	8(up), %r9
	mov	16(up), %r10
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	lea	16(up), up
	lea	16(vp), vp
	lea	16(rp), rp
	rcr	%r10
	rcr	%r9
	rcr	%rbx
	mov	%rbx, -16(rp)
	jmp	L(w2)

C Three extra limbs, n = 4 8 12 ...  The decrement makes the shift by two
C at L(grp) yield one group less, since four limbs are accounted for here.
L(m0):	dec	n
	add	%rbx, %rbx		C bit 63 back into CF, undo the rotate
	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	ADCSBB	24(vp), %r10
	lea	24(up), up
	lea	24(vp), vp
	lea	24(rp), rp
	rcr	%r10
	rcr	%r9
	rcr	%r8
	rcr	%rbx
	mov	%rbx, -24(rp)
	mov	%r8, -16(rp)
L(w2):	mov	%r9, -8(rp)
L(w1):	mov	%r10, %rbx		C top limb of the feed-in becomes pending

L(grp):
	shr	$2, n			C number of four-limb groups left
	je	L(fin)			C none, only the pending limb remains
	ALIGN(16)
C Main loop, four limbs per iteration.  All loads of an iteration precede
C its stores.
L(top):	add	%rbx, %rbx		C bit 63 back into CF, undo the rotate

	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	mov	32(up), %r11
	ADCSBB	24(vp), %r10
	ADCSBB	32(vp), %r11

	lea	32(up), up
	lea	32(vp), vp

	rcr	%r11			C carry out parked in bit 63 of %r11
	rcr	%r10
	rcr	%r9
	rcr	%r8

	rcr	%rbx			C pending limb receives low bit of %r8
	mov	%rbx, (rp)
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	mov	%r10, 24(rp)
	mov	%r11, %rbx		C highest limb becomes the pending limb

	lea	32(rp), rp
	dec	n
	jne	L(top)

C Store the last pending limb.  Its bit 63 is the final carry or borrow.
L(fin):	mov	%rbx, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()