1dnl AMD64 mpn_add_n, mpn_sub_n 2 3dnl Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C AMD K8,K9 1.5 35C AMD K10 1.5 36C AMD bd1 1.8 37C AMD bd2 1.74 38C AMD bd3 ? 39C AMD bd4 1.78 40C AMD zen 1.5 41C AMD bt1 2.54 42C AMD bt2 2.15 43C Intel P4 11.5 44C Intel core2 4.9 45C Intel NHM 5.53 46C Intel SBR 1.59 47C Intel IBR 1.55 48C Intel HWL 1.44 49C Intel BWL 1.14 50C Intel SKL 1.21 51C Intel atom 4 52C Intel SLM 3 53C VIA nano 3.25 54 55C The loop of this code is the result of running a code generation and 56C optimization tool suite written by David Harvey and Torbjorn Granlund. 
C INPUT PARAMETERS
C Each define names the register that carries the argument; the trailing
C comment on each line is the alternative location noted by the original
C author (presumably the DOS64 ABI location -- confirm).
define(`rp',	`%rdi')	C rcx
define(`up',	`%rsi')	C rdx
define(`vp',	`%rdx')	C r8
define(`n',	`%rcx')	C r9
define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)

C ADCSBB is the carry-propagating instruction (adc when building add_n, sbb
C when building sub_n); func and func_nc are the two entry-point names
C generated from this one source.
ifdef(`OPERATION_add_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_n)
	define(func_nc,	      mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_n)
	define(func_nc,	      mpn_sub_nc)')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C func_nc(rp, up, vp, n, cy)
C Same operation as func below, except that the carry chain starts from bit 0
C of cy instead of from zero.  This entry point has no code of its own past
C the setup: it jumps to L(lt4) or L(mid), which are located inside func.
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
C NOTE(review): the define above documents cy at rsp+40, while this load uses
C offset 56; presumably FUNC_ENTRY(4) has pushed two registers by this point
C -- confirm against the FUNC_ENTRY definition.
IFDOS(`	mov	56(%rsp), %r8	')
	mov	R32(n), R32(%rax)	C eax = low 32 bits of n
	shr	$2, n			C n = number of whole 4-limb groups
	and	$3, R32(%rax)		C eax = n mod 4 = leftover limbs
	bt	$0, %r8			C cy flag <- carry parameter
	jrcxz	L(lt4)			C no whole group; jrcxz leaves flags alone

	mov	(up), %r8		C preload the first two limbs of group 0
	mov	8(up), %r9
	dec	n			C ZF set if there is only one group; CF kept
	jmp	L(mid)

EPILOGUE()
	ALIGN(16)
C func(rp, up, vp, n)
C For i = 0 .. n-1 stores up[i] ADCSBB vp[i] at rp[i], limbs being 8 bytes
C wide, with the carry or borrow propagated from each limb to the next.
C Returns the final carry or borrow (0 or 1) in eax.
C
C Register use:
C   r8-r11   the (up to) four limbs currently being processed
C   eax      n mod 4 until the end, then the return value
C   n (rcx)  count of 4-limb groups still to do
C
C The carry flag is live from the first ADCSBB to the final adc/setc.  Between
C those instructions only mov, lea, inc, dec and branches appear; none of them
C writes CF, which is what keeps the chain intact.
C
C NOTE(review): n = 0 is not handled.  It reaches L(lt4) with eax = 0, falls
C through to L(3), accesses three limbs and returns eax with garbage high
C bits.  Presumably callers guarantee n >= 1 -- confirm.
PROLOGUE(func)
	FUNC_ENTRY(4)
	mov	R32(n), R32(%rax)	C eax = low 32 bits of n
	shr	$2, n			C n = number of whole 4-limb groups
	and	$3, R32(%rax)		C eax = n mod 4; and also clears CF
	jrcxz	L(lt4)			C no whole group; jrcxz leaves flags alone

	mov	(up), %r8		C preload the first two limbs of group 0
	mov	8(up), %r9
	dec	n			C ZF set if there is only one group; CF kept
	jmp	L(mid)

C Tail code: eax holds the number of limbs (1 to 3) left at up/vp/rp.
L(lt4):	dec	R32(%rax)		C was eax 1 ?
	mov	(up), %r8
	jnz	L(2)
	ADCSBB	(vp), %r8
	mov	%r8, (rp)
	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
	FUNC_EXIT()
	ret

L(2):	dec	R32(%rax)		C was eax 2 ?
	mov	8(up), %r9
	jnz	L(3)
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
	FUNC_EXIT()
	ret

C Three limbs left.  eax started at 3 and has been decremented twice, so it
C is 1 with zero upper bits; setc overwrites the low byte with CF.
L(3):	mov	16(up), %r10
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	setc	R8(%rax)
	FUNC_EXIT()
	ret

	ALIGN(16)
C Main loop: one 4-limb group per iteration.  At L(top) r8-r11 already hold
C the four up limbs of the current group.  After the stores, the first two
C limbs of the next group are loaded; L(mid) loads the other two.  The two
C entry points jump to L(mid) with r8/r9 preloaded and ZF set by their dec n.
L(top):	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up		C lea advances pointers without touching flags
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	dec	n			C sets ZF for the jnz at L(mid); CF kept
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8		C first two limbs of the next group
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(top)			C ZF still from the last dec n

C Last whole group: its limbs are already in r8-r11.  The pointers are
C advanced past it so that L(lt4) can handle any leftover limbs.
L(end):	lea	32(up), up
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	lea	32(vp), vp
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)
	lea	32(rp), rp

	inc	R32(%rax)		C inc then dec tests eax for zero
	dec	R32(%rax)		C without disturbing CF
	jnz	L(lt4)			C leftover limbs: finish in the tail code
	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
	FUNC_EXIT()
	ret
EPILOGUE()