dnl  AMD64 mpn_addmul_1 and mpn_submul_1.

dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.5
C AMD K10	 2.5
C AMD bd1	 5.0
C AMD bobcat	 6.17
C Intel P4	14.9
C Intel core2	 5.09
C Intel NHM	 4.9
C Intel SBR	 4.0
C Intel atom	21.3
C VIA nano	 5.0

C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * The loop is great, but the prologue and epilogue code was quickly written.
C    Tune it!

C Entry point func (mpn_addmul_1 or mpn_submul_1, selected by the OPERATION_*
C symbol tested below).  Each of the n limbs read from up is multiplied by vl
C and the product is combined with the corresponding limb at rp using ADDSUB
C (add or sub).  The leftover high limb plus the final carry or borrow is
C returned in rax.
C NOTE(review): there is no n = 0 path in this code, so n >= 1 appears to be
C assumed - confirm against the callers.

C Argument registers for the STD64 ABI.  The trailing comment on each line is
C the register that carries the same argument under DOS64.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vl',      `%rcx')   C r9

C Working copy of the limb count; n_param lives in rdx, which mul overwrites.
define(`n',       `%r11')

C Select the operation: ADDSUB is the instruction applied to the rp limbs and
C func is the symbol name handed to PROLOGUE.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C DOS64 overrides.  rp, vl and n become rcx, r9 and r8; up stays rsi and is
C loaded from rdx in the prologue.  The r9 and r8 definitions then redirect the
C scratch uses of %r9 and %r8 in the code below to rdi and r11.  Presumably the
C doubled quotes keep vl and n themselves from being redirected - TODO confirm
C against the m4 quoting rules.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`vl', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``r11'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)

C DOS64 only: rsi and rdi are saved here and restored before ret, and up is
C moved from rdx into rsi before mul overwrites rdx.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx			C rbx is used as scratch; restored before ret
IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %rbx		')
	mul	vl			C rdx:rax = first u limb times vl
IFSTD(`	mov	%rbx, n		')

C Dispatch on n mod 4 to the matching entry point of the 4-way unrolled loop.
C Remainder 1 falls through to L(b1).
	and	$3, R32(%rbx)
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jz	L(b2)
	jg	L(b3)

C n mod 4 = 1.  When n is exactly 1 the first product is applied to rp right
C away and the carry or borrow from ADDSUB is folded in at L(ret); the jmp in
C between leaves the flags untouched.
L(b1):	dec	n
	jne	L(gt1)
	ADDSUB	%rax, (rp)
	jmp	L(ret)
C More than one limb.  up and rp are advanced toward the end of the operands
C and n is negated so that it acts as a negative index counting up to zero.
C The first product is parked in r9 (low) and r8 (high), the other two
C accumulators are cleared, and the next u limb is loaded for the mul at L(L1).
L(gt1):	lea	8(up,n,8), up
	lea	-8(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r9
	mov	(up,n,8), %rax
	mov	%rdx, %r8
	jmp	L(L1)

C n mod 4 = 0.  First product goes to r8 (low) and rbx (high); enter at L(L0),
C which loads the next u limb itself.
L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8
	mov	%rdx, %rbx
	jmp	L(L0)

C n mod 4 = 3.  First product goes to rbx (low) and r10 (high); enter at L(L3),
C which loads the next u limb itself.
L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(L3)

C n mod 4 = 2.  First product goes to r10 (low) and r9 (high); the next u limb
C is loaded here because L(L2) starts directly with the mul.
L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%rdx, %r9
	jmp	L(L2)

C Main loop: four limbs per iteration; n advances by 4 and the loop repeats
C while it is still negative.  Each step applies a pending low limb to rp with
C ADDSUB, absorbs the resulting carry or borrow with the adc instructions that
C follow, and issues the mul for the next u limb.  r8, r9, r10 and rbx rotate
C as holders of the pending product limbs.
	ALIGN(16)
L(top):	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, R32(%r10)
L(L1):	mul	vl
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	ADDSUB	%r8, 16(rp,n,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	ADDSUB	%rbx, 24(rp,n,8)
C The registers below are cleared with mov rather than xor; mov leaves the
C flags alone, presumably so that the carry from ADDSUB reaches the adc.
	mov	$0, R32(%r8)		C zero
	mov	%r8, %rbx		C zero
	adc	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9		C zero
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)

C Wind down: apply the last two pending limbs.  r8 is zero on every path that
C reaches this point, so the adc into rdx only adds the carry flag.
	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	adc	%r8, %rdx
	ADDSUB	%r9, 8(rp,n,8)
L(ret):	adc	$0, %rdx		C fold the last carry or borrow into the high limb
	mov	%rdx, %rax		C return value

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()