dnl  AMD64 mpn_add_n, mpn_sub_n

dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 1.5
C AMD K10	 1.5
C Intel P4	 ?
C Intel core2	 4.9
C Intel NHM	 5.5
C Intel SBR	 1.59
C Intel atom	 4
C VIA nano	 3.25

C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C INPUT PARAMETERS
C Each define names the register the code below uses for that argument.  The
C trailing comment on each line gives the matching Win64 (DOS64) argument
C location; FUNC_ENTRY presumably moves those into the registers named here
C (macro defined outside this file -- confirm).
define(`rp',	`%rdi')	C rcx
define(`up',	`%rsi')	C rdx
define(`vp',	`%rdx')	C r8
define(`n',	`%rcx')	C r9
define(`cy',	`%r8')	C rsp+40    (only for mpn_add_nc)

C Exactly one of OPERATION_add_n / OPERATION_sub_n is expected to be defined
C when this file is processed.  It selects the carry-propagating instruction
C (adc or sbb) and the names of the two entry points emitted below.
ifdef(`OPERATION_add_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_n)
	define(func_nc,	      mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_n)
	define(func_nc,	      mpn_sub_nc)')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ---------------------------------------------------------------------------
C func    (rp, up, vp, n)
C func_nc (rp, up, vp, n, cy)
C
C For i = 0 .. n-1: load the 8-byte limb up[i], apply ADCSBB (adc or sbb) with
C vp[i] and the running carry flag, store the result to rp[i].  The final
C carry flag (0 or 1) is left in eax before ret.  func_nc seeds the carry
C flag from bit 0 of cy; func starts with the carry flag clear.
C
C Register roles:
C   rax      n mod 4 (limbs left after the 4-limb blocks); finally the result
C   n (rcx)  n / 4, the number of 4-limb blocks still to do
C   r8-r11   limbs loaded from up, combined in place with limbs from vp
C
C Carry-flag discipline: between ADCSBB groups only mov, lea, dec, inc and
C jumps are used.  None of these modifies CF, so the carry survives from one
C block to the next.  Do not reorder or substitute add/sub/cmp/test here.
C
C NOTE(review): n = 0 is not handled.  With rax = 0 and rcx = 0 the code
C reaches L(lt4), both dec instructions leave a non-zero value, and the
C 3-limb path at L(3) runs.  Presumably callers guarantee n >= 1 -- confirm.
C ---------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
C NOTE(review): 56(%rsp) here versus the rsp+40 noted at the cy define
C presumably reflects 16 bytes pushed by FUNC_ENTRY -- confirm in the macro.
IFDOS(`	mov	56(%rsp), %r8	')
	mov	R32(n), R32(%rax)
	shr	$2, n			C n = number of whole 4-limb blocks
	and	$3, R32(%rax)		C eax = n mod 4, leftover limbs
	bt	$0, %r8			C cy flag <- carry parameter
	jrcxz	L(lt4)			C no whole block; jrcxz ignores flags

C At least one whole block: preload its first two limbs and join the shared
C loop at L(mid).  dec leaves CF alone and sets ZF for the jnz at L(mid).
	mov	(up), %r8
	mov	8(up), %r9
	dec	n
	jmp	L(mid)

EPILOGUE()
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	mov	R32(n), R32(%rax)
	shr	$2, n			C n = number of whole 4-limb blocks
	and	$3, R32(%rax)		C eax = n mod 4; and also clears CF
	jrcxz	L(lt4)			C no whole block

C Same preload and loop entry as in func_nc above.
	mov	(up), %r8
	mov	8(up), %r9
	dec	n
	jmp	L(mid)

C Tail: eax = number of limbs still to process (1, 2 or 3 for n >= 1).
C Reached directly when n < 4, or from L(end) when n mod 4 is non-zero.
C Each dec both counts down and selects the case, without touching CF.
L(lt4):	dec	R32(%rax)
	mov	(up), %r8
	jnz	L(2)
C One limb left.  eax is 0 here, so adc eax,eax yields exactly CF.
	ADCSBB	(vp), %r8
	mov	%r8, (rp)
	adc	R32(%rax), R32(%rax)
	FUNC_EXIT()
	ret

C Two limbs left (eax reaches 0 after the second dec).
L(2):	dec	R32(%rax)
	mov	8(up), %r9
	jnz	L(3)
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	R32(%rax), R32(%rax)	C eax = 0 + 0 + CF
	FUNC_EXIT()
	ret

C Three limbs left.  eax is 1 here (3 decremented twice), so writing CF into
C its low byte with setc leaves eax = CF.
L(3):	mov	16(up), %r10
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	setc	R8(%rax)
	FUNC_EXIT()
	ret

C Main loop, four limbs per pass.  On entry at L(top) r8-r11 already hold
C up[0..3] of the current block.  The pass combines them with vp[0..3], stores
C to rp[0..3], advances all three pointers by 32 bytes and preloads the next
C block.  The loop is entered at L(mid) with r8 and r9 preloaded.
	ALIGN(16)
L(top):	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	dec	n			C sets ZF for jnz below, CF untouched
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(top)			C ZF comes from the most recent dec n

C Last whole block: r8-r11 are loaded, no further block to preload.  The
C pointers are still advanced so that L(lt4) can address any leftover limbs.
L(end):	lea	32(up), up
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	lea	32(vp), vp
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)
	lea	32(rp), rp

C inc followed by dec leaves eax unchanged and sets ZF from it while keeping
C CF, which a test or cmp would destroy.
	inc	R32(%rax)
	dec	R32(%rax)
	jnz	L(lt4)			C leftover limbs remain
	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
	FUNC_EXIT()
	ret
EPILOGUE()