dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)

dnl  Copyright 2003, 2005, 2006, 2007, 2011, 2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.2
C AMD K10	 2.2
C Intel P4	12.75
C Intel core2	 3.45
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 3.25

C Sometimes the speed degrades, apparently because some operand alignments
C cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

C Operation, as implemented below:
C   Each vp limb is doubled with an add/adc chain and the result is
C   subtracted from the matching up limb with a sub/sbb chain; the
C   difference is stored to rp.  The value returned in %eax is the sum of
C   the bit shifted out of the last vp limb and the final subtract borrow,
C   so it is 0, 1 or 2.
C
C Register usage:
C   %r8-%r11  vp limbs, doubled in place
C   %rax      low half keeps the saved shift carry ("scy") as 0 or -1
C   %rbp      low half keeps the saved subtract borrow ("acy") as 0 or -1
C             between blocks; otherwise scratch for up limbs
C   %rbx      scratch for up limbs
C   n         negated limb count that climbs toward zero; the three pointers
C             are advanced by n limbs first, so (ptr,n,8) starts at limb 0
C
C Carry bookkeeping:
C   Two carry chains share the single CF flag.  "sbb r,r" parks CF in a
C   register as 0 or -1, and a later "add r,r" on that value regenerates CF
C   (-1 + -1 carries out, 0 + 0 does not).
C
C NOTE(review): ABI_SUPPORT, FUNC_ENTRY and FUNC_EXIT are not defined in this
C file; presumably FUNC_ENTRY(4)/FUNC_EXIT() adapt the four arguments for the
C DOS64 calling convention -- confirm against the macro definitions.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
	FUNC_ENTRY(4)
	push	%rbx			C rbx and rbp are modified below and
	push	%rbp			C restored before returning

	mov	(vp), %r8		C fetch vp[0] before vp is advanced
	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp		C point each operand just past its end
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n			C index now runs from -n up to 0
	xor	R32(%rbp), R32(%rbp)	C acy = 0 for a direct entry at L(b00)
	and	$3, R32(%rax)		C eax = n mod 4; also leaves CF clear
	je	L(b00)			C multiple of 4: enter the loop mid-way
	cmp	$2, R32(%rax)
	jc	L(b01)			C remainder 1
	je	L(b10)			C remainder 2
					C fall through: remainder 3

C Handle three leading limbs, then join the common loop entry.
L(b11):	add	%r8, %r8		C start the shift chain (no carry in)
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp		C start the subtract chain (no borrow in)
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	sbb	%r10, %rbp
	mov	%rbp, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$3, n			C sets SF for the test at L(ent)
	jmp	L(ent)

C Handle two leading limbs, then join the common loop entry.
L(b10):	add	%r8, %r8		C start the shift chain (no carry in)
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp		C start the subtract chain (no borrow in)
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$2, n			C sets SF for the test at L(ent)
	jmp	L(ent)

C Handle one leading limb, then fall into the common loop entry.
L(b01):	add	%r8, %r8		C start the shift chain (no carry in)
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	sub	%r8, %rbp		C start the subtract chain (no borrow in)
	mov	%rbp, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	inc	n			C sets SF for the test at L(ent)
L(ent):	jns	L(end)			C index no longer negative: all limbs done

C Main loop: four limbs per iteration.
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy

	mov	(vp,n,8), %r8
C Direct entry for n mod 4 == 0: r8 already holds vp[0], CF is clear from
C the "and" above, and rbp is zero, so both chains start without a carry.
L(b00):	adc	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy

	mov	(up,n,8), %rbp		C mov leaves CF intact for the sbb chain
	mov	8(up,n,8), %rbx
	sbb	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	mov	24(up,n,8), %rbx
	sbb	%r10, %rbp
	sbb	%r11, %rbx
	mov	%rbp, 16(rp,n,8)
	mov	%rbx, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$4, n
	js	L(top)			C loop while the index is still negative

C eax and ebp each hold 0 or -1 here; return scy + acy as a non-negative count.
L(end):	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()