dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)

dnl  Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 2.2
C K10:		 2.2
C P4:		12.75
C P6 core2:	 3.45
C P6 corei7:	 3.45
C P6 atom:	 ?


C Speed sometimes degrades, presumably because certain operand alignments
C cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
C in the loop; at 3 instructions per cycle and 4 limbs per iteration this
C corresponds to 26/3/4 = 2.167 c/l.

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

C mpn_sublsh1_n: rp[] = up[] - (vp[] << 1) over n limbs.
C
C Register usage
C   rp, up, vp   limb pointers, moved to one past the last limb on entry
C   n            negated limb count, used as an index that climbs to zero
C   %r8-%r11     limbs of vp, doubled in place with an add/adc chain
C   %rbx, %rbp   limbs of up and the result limbs (saved with push/pop)
C   %eax         "scy": carry out of the doubling chain, parked as 0 or -1
C   %ebp         "acy": borrow out of the subtract chain, parked as 0 or -1
C                (shares %rbp with the limb scratch; it is re-read into CF
C                before %rbp is overwritten)
C
C The two carries are parked with "sbb r,r" and re-created in CF with
C "add r,r", because the doubling chain and the subtract chain both need CF.
C
C Return value (in %eax): final shift carry plus final subtract borrow.
C
C n is expected to be at least 1; a zero count would take the n mod 4 = 0
C path and process four limbs.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
	push	%rbx
	push	%rbp

	mov	R32(n), R32(%rax)
	mov	(vp), %r8		C lowest limb of vp, read before vp moves
	lea	(vp,n,8), vp
	lea	(up,n,8), up
	lea	(rp,n,8), rp
	neg	n
	xor	R32(%rbp), R32(%rbp)	C acy = 0
	and	$3, R32(%rax)		C n mod 4 picks the feed-in path, CF = 0
	je	L(rem0)
	cmp	$2, R32(%rax)
	jc	L(rem1)			C n mod 4 = 1
	je	L(rem2)			C n mod 4 = 2

C n mod 4 = 3: handle three limbs, then join the unrolled loop.
L(rem3):
	mov	8(vp,n,8), %r9
	mov	16(vp,n,8), %r10
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C park scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	sbb	%r10, %rbp
	mov	%rbp, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C park acy
	add	$3, n
	jmp	L(chk)

C n mod 4 = 2: handle two limbs, then join the unrolled loop.
L(rem2):
	mov	8(vp,n,8), %r9
	add	%r8, %r8
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C park scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C park acy
	add	$2, n
	jmp	L(chk)

C n mod 4 = 1: handle one limb, then fall into the loop test.
L(rem1):
	add	%r8, %r8
	sbb	R32(%rax), R32(%rax)	C park scy
	mov	(up,n,8), %rbp
	sub	%r8, %rbp
	mov	%rbp, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C park acy
	inc	n
L(chk):
	jns	L(done)			C index reached zero: nothing left

C Main loop, four limbs per pass.  Instruction order is kept as scheduled.
	ALIGN(16)
L(loop):
	add	R32(%rax), R32(%rax)	C CF = scy

	mov	(vp,n,8), %r8
L(rem0):				C entry for n mod 4 = 0: %r8 preloaded, CF = 0
	adc	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C park scy
	add	R32(%rbp), R32(%rbp)	C CF = acy

	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sbb	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	mov	24(up,n,8), %rbx
	sbb	%r10, %rbp
	sbb	%r11, %rbx
	mov	%rbp, 16(rp,n,8)
	mov	%rbx, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C park acy
	add	$4, n
	js	L(loop)

C Both parked carries are 0 or -1, so negating their sum gives scy + acy.
L(done):
	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)

	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()