dnl  AMD64 mpn_addmul_1 for CPUs with mulx and adx.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 -
C AMD K10	 -
C AMD bd1	 -
C AMD bd2	 -
C AMD bd3	 -
C AMD bd4	 -
C AMD zen	 ?
C AMD bt1	 -
C AMD bt2	 -
C Intel P4	 -
C Intel PNR	 -
C Intel NHM	 -
C Intel SBR	 -
C Intel IBR	 -
C Intel HWL	 -
C Intel BWL	 ?
C Intel SKL	 ?
C Intel atom	 -
C Intel SLM	 -
C VIA nano	 -

C Register assignment.
C
C Incoming arguments:  rp = %rdi, up = %rsi, n_param = %rdx, v0_param = %rcx.
C NOTE(review): the trailing dnl notes (rcx, rdx, r8, r9) presumably name the
C argument registers of another calling convention; no entry sequence for
C that convention is visible in this file - confirm before relying on it.
C
C After the xchg in the entry code the last two arguments trade places:
C   n  = %rcx   so that jrcxz can test the loop counter without touching flags
C   v0 = %rdx   because mulx takes its implicit multiplicand from %rdx
define(`rp',      `%rdi')	dnl rcx
define(`up',      `%rsi')	dnl rdx
define(`n_param', `%rdx')	dnl r8
define(`v0_param',`%rcx')	dnl r9

define(`n',       `%rcx')	dnl
define(`v0',      `%rdx')	dnl

C Testing mechanism for running this on older AMD64 processors
C
C With FAKE_MULXADX equal to 1 the definitions of adox, adcx and mulx are
C taken from missing-call.m4; otherwise the three macros below simply expand
C to the instruction of the same name with the operands passed through.
ifelse(FAKE_MULXADX,1,`
  include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
',`
  define(`adox',	``adox'	$1, $2')
  define(`adcx',	``adcx'	$1, $2')
  define(`mulx',	``mulx'	$1, $2, $3')
')

C ---------------------------------------------------------------------------
C mpn_addmul_1 (rp, up, n, v0)
C
C Operation, as read from the code below: every limb up[i] is multiplied by
C v0, the product is added into rp[i] together with the carries from the
C previous limb, and the low 64 bits are stored back to rp[i].  The carry
C out of the most significant limb is returned in %rax.
C
C In:     rp = %rdi, up = %rsi, n = %rdx, v0 = %rcx
C Out:    %rax = carry-out limb
C Saved:  %rbx, %r12, %r13 (pushed on entry, popped before ret)
C Writes: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags
C
C NOTE(review): no handling for n == 0 is visible (that value would enter at
C L(b0) and read up[0..2]); presumably callers guarantee n >= 1 - confirm.
C
C Method: each mulx( src, lo, hi) yields a (lo,hi) product pair; the pairs
C rotate through (r9,r8), (r11,r10), (r13,r12), (rbx,rax).  Two independent
C carry chains run interleaved:
C   adcx (uses CF)  adds the hi word of limb i into the lo word of limb i+1
C   adox (uses OF)  adds the rp[] memory limb into that same lo word
C Between the chain instructions only mulx, mov, lea, jrcxz and jmp appear in
C the loop, none of which modifies CF or OF, so both chains stay live across
C iterations.
C ---------------------------------------------------------------------------
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
C NOTE(review): %r8 is rewritten by a mulx before it is read on every path
C below, so this load looks dead as a value; presumably it is an early touch
C of up[0] - confirm before removing.
	mov	(up), %r8

C Callee-saved registers used as product registers.
	push	%rbx
	push	%r12
	push	%r13

C Bias the pointers so that a negative index counting up to zero can be used:
C up = up + 8*n and rp = rp + 8*n - 16.
	lea	(up,n_param,8), up
	lea	-16(rp,n_param,8), rp
C Keep a copy of the low part of n in %rax for the n mod 4 dispatch below.
C NOTE(review): R32() and R8() are not defined in this file; presumably they
C select the 32-bit and low-byte names of a register - confirm in the m4 defs.
	mov	R32(n_param), R32(%rax)
C Swap %rcx and %rdx: v0 lands in %rdx (mulx multiplicand), n in %rcx.
	xchg	v0_param, v0		C FIXME: is this insn fast?

C n = -n; from here on n is a negative index stepping towards zero.
	neg	n

C Dispatch on n mod 4 into the 4-way unrolled loop.  The and also clears
C both CF and OF, which is the required start state of the two carry chains.
	and	$3, R8(%rax)
	jz	L(b0)
C al is now 1, 2 or 3:  below 2 -> L(b1), equal -> L(b2), else fall into b3.
	cmp	$2, R8(%rax)
	jl	L(b1)
	jz	L(b2)

C n mod 4 = 3: form the products of up[0..2], then enter the loop at L(lo3).
C The dec biases n so that 24(rp,n,8) at L(lo3) addresses rp[0]; dec leaves
C CF unchanged.
L(b3):	mulx(	(up,n,8), %r11, %r10)
	mulx(	8(up,n,8), %r13, %r12)
	mulx(	16(up,n,8), %rbx, %rax)
	dec	n
	jmp	L(lo3)

C n mod 4 = 0: form the products of up[0..2], then enter the loop at L(lo0)
C where 16(rp,n,8) addresses rp[0].
L(b0):	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	mulx(	16(up,n,8), %r13, %r12)
	jmp	L(lo0)

C n mod 4 = 2: form the products of up[0..1] and advance n by 2.  If that
C makes n zero the operand had exactly two limbs: go straight to the
C wind-down at L(wd2).  Otherwise form the up[2] product and enter at L(lo2).
L(b2):	mulx(	(up,n,8), %r13, %r12)
	mulx(	8(up,n,8), %rbx, %rax)
	lea	2(n), n
	jrcxz	L(wd2)
L(gt2):	mulx(	(up,n,8), %r9, %r8)
	jmp	L(lo2)

C n mod 4 = 1: the cmp above borrowed (1 - 2) and therefore left CF set, so
C the and of al with itself is there to clear CF and OF again.  Form the
C up[0] product and advance n by 1.  If that makes n zero the operand had
C exactly one limb: go to L(wd1).  Otherwise form the products of up[1..2]
C and enter the loop at L(lo1).
L(b1):	and	R8(%rax), R8(%rax)
	mulx(	(up,n,8), %rbx, %rax)
	lea	1(n), n
	jrcxz	L(wd1)
	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	jmp	L(lo1)

C Wind-down.  Every entry here arrives through a jrcxz, so n (%rcx) is zero
C and the indexed addresses of the loop collapse to plain rp offsets.
C Finishes the last three limbs held in r11, r13 and rbx.
L(end):	adcx(	%r10, %r13)
	mov	%r11, -8(rp)
L(wd2):	adox(	(rp), %r13)
	adcx(	%r12, %rbx)
	mov	%r13, (rp)
L(wd1):	adox(	8(rp), %rbx)
C %rcx is zero, so these two only fold the final CF and the final OF into
C the top hi word, giving the return value in %rax.
	adcx(	%rcx, %rax)
	adox(	%rcx, %rax)
	mov	%rbx, 8(rp)
	pop	%r13
	pop	%r12
	pop	%rbx
	ret

C Main loop, four limbs per iteration.  jrcxz leaves when n has reached zero
C and, like the lea that advances n, does not disturb CF or OF.  Each store
C writes the limb completed by the adox of the preceding group.
L(top):	jrcxz	L(end)
	mulx(	(up,n,8), %r9, %r8)
	adcx(	%r10, %r13)
	mov	%r11, -8(rp,n,8)
L(lo2):	adox(	(rp,n,8), %r13)
	mulx(	8(up,n,8), %r11, %r10)
	adcx(	%r12, %rbx)
	mov	%r13, (rp,n,8)
L(lo1):	adox(	8(rp,n,8), %rbx)
	mulx(	16(up,n,8), %r13, %r12)
	adcx(	%rax, %r9)
	mov	%rbx, 8(rp,n,8)
L(lo0):	adox(	16(rp,n,8), %r9)
	mulx(	24(up,n,8), %rbx, %rax)
	adcx(	%r8, %r11)
	mov	%r9, 16(rp,n,8)
L(lo3):	adox(	24(rp,n,8), %r11)
	lea	4(n), n
	jmp	L(top)
EPILOGUE()
ASM_END()