1dnl AMD64 mpn_mul_1. 2 3dnl Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C AMD K8,K9 2.5 35C AMD K10 2.5 36C AMD bd1 5.0 37C AMD bobcat 5.5 38C Intel P4 12.3 39C Intel core2 4.0 40C Intel NHM 3.75 41C Intel SBR 2.95 42C Intel atom 19.8 43C VIA nano 4.25 44 45C The loop of this code is the result of running a code generation and 46C optimization tool suite written by David Harvey and Torbjorn Granlund. 47 48C TODO 49C * The loop is great, but the prologue and epilogue code was quickly written. 50C Tune it! 
C ---------------------------------------------------------------------------
C Entry points in this block:
C   mpn_mul_1  (rp, up, n, vl)        carry-in limb is zero
C   mpn_mul_1c (rp, up, n, vl, cin)   carry-in limb is the fifth argument
C
C Both multiply the limbs read through up by the single limb vl, add the
C carry-in to the first product, store the low result limbs through rp, and
C return the final high limb in rax.  mpn_mul_1c only loads its carry-in into
C r10 and then jumps to L(common) inside mpn_mul_1.
C
C NOTE(review): (up) is loaded before n is examined, so this presumably
C requires n >= 1 -- confirm against the documented mpn interface.
C
C IFDOS, IFSTD, R32, L, PROLOGUE and EPILOGUE are macros defined outside this
C file.  IFDOS and IFSTD appear to emit their argument only when building for
C the DOS64 or the STD64 ABI respectively.
C ---------------------------------------------------------------------------

C Argument registers for STD64.  The trailing comment on each line names the
C register the same argument arrives in under DOS64.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vl',      `%rcx')   C r9

C Working copy of the limb count.  It is negated before the loop and counts
C up towards zero.
define(`n',       `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C DOS64 overrides.  rp, vl and n stay in their incoming argument registers
C (rcx, r9, r8), and up is copied from rdx to rsi by the prologues below.
C Since vl and n then occupy the real r9 and r8, the bare words r9 and r8 are
C redefined, presumably so that %r9 and %r8 written in the code below
C assemble as %rdi and %r11 -- verify against the m4 expansion.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`vl', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``r11'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_1c)
C DOS64 only: save rsi and rdi, which are callee-saved in that ABI, and move
C up from its incoming register rdx into rsi.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')
	push	%rbx			C callee-saved, used as scratch below
IFSTD(`	mov	%r8, %r10')		C r10 = carry-in, fifth argument in r8
IFDOS(`	mov	64(%rsp), %r10')	C 40 + 3*8  (3 push insns)
	jmp	L(common)		C continue in the shared body below
EPILOGUE()

PROLOGUE(mpn_mul_1)
C Same DOS64 register save and up relocation as in mpn_mul_1c.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	push	%rbx			C callee-saved, used as scratch below
	xor	%r10, %r10		C carry-in limb = 0
L(common):
	mov	(up), %rax		C read first u limb early
IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %rbx		')
	mul	vl			C rdx:rax = first u limb * vl
IFSTD(`	mov	%rbx, n		')

	add	%r10, %rax		C add the carry-in to the low half
	adc	$0, %rdx		C and propagate the carry into the high half

C Dispatch on n mod 4 to pick the entry point into the 4-way unrolled loop.
C Each entry stub biases up and rp towards the end of the operands and negates
C n, so that the loop can index with a negative n and stop when n turns
C non-negative.  The first product (rax low, rdx high) is parked in the pair
C of accumulators that the chosen loop entry expects.
	and	$3, R32(%rbx)
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jz	L(b2)
	jg	L(b3)

C n mod 4 == 1
L(b1):	dec	n
	jne	L(gt1)
	mov	%rax, (rp)		C n was 1: store the only low limb
	jmp	L(ret)			C the high limb is already in rdx
L(gt1):	lea	8(up,n,8), up
	lea	-8(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r9		C low half of the first product
	mov	(up,n,8), %rax		C next u limb for the mul at L(L1)
	mov	%rdx, %r8		C high half of the first product
	jmp	L(L1)

C n mod 4 == 0
L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8		C low half of the first product
	mov	%rdx, %rbx		C high half of the first product
	jmp	L(L0)

C n mod 4 == 3
L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx		C low half of the first product
	mov	%rdx, %r10		C high half of the first product
	jmp	L(L3)

C n mod 4 == 2
L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10		C low half of the first product
	mov	24(up,n,8), %rax	C next u limb for the mul at L(L2)
	mov	%rdx, %r9		C high half of the first product
	jmp	L(L2)

C Main loop, four limbs per iteration.  r10, r9, r8 and rbx are the rotating
C accumulators.  Each stage stores one finished limb, adds the low half of
C the latest product into the next accumulator, and adds the high half plus
C carry into the accumulator after that, which holds zero at that point.
	ALIGN(16)
L(top):	mov	%r10, (rp,n,8)
	add	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, R32(%r10)		C r10 is free again after its store above
L(L1):	mul	vl
	mov	%r9, 8(rp,n,8)
	add	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	mov	%r8, 16(rp,n,8)
	add	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	mov	%rbx, 24(rp,n,8)
	mov	$0, R32(%r8)		C zero
	mov	%r8, %rbx		C zero
	add	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9		C zero; mov keeps CF live for the adc below
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)			C loop while n is still negative

C Wind-down: store the last two low limbs.  r8 is zero here, so the adc folds
C only the carry into rdx and the final add leaves rdx unchanged.
	mov	%r10, (rp,n,8)
	add	%rax, %r9
	adc	%r8, %rdx
	mov	%r9, 8(rp,n,8)
	add	%r8, %rdx
L(ret):	mov	%rdx, %rax		C return the high limb

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()