dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]

dnl  Copyright 2003, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 2
C K10:		 2
C P4:		13
C P6 core2:	 3.45
C P6 corei7:	 3.45
C P6 atom:	 ?


C Sometimes speed degenerates, supposedly because some operand alignments
C cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
C in the loop, which corresponds to ceil(22/3)/4 = 2 c/l (22/3/4 = 1.83 c/l
C without rounding up to whole decode cycles).
C INPUT PARAMETERS
C   rp   destination limb pointer (result limbs are stored here)
C   up   source limb pointer; added to (addlsh1) or subtracted from (rsblsh1)
C        the shifted operand
C   vp   source limb pointer; this operand is shifted left by one bit
C   n    limb count (each limb is 8 bytes)
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

C REGISTER USE
C   %rax       saved carry of the shift chain (scy), held as 0 or -1 between
C              chains; also carries the return value
C   %rbp       saved carry/borrow of the add/sub chain (acy), held as 0 or -1;
C              pushed on entry and popped before returning
C   %r8-%r11   limbs currently being processed

C Select the operation.  ADDSUB is used for the first limb of a wind-up path
C (no carry-in), ADCSBB for every limb that consumes a carry/borrow.
ifdef(`OPERATION_addlsh1_n', `
  define(ADDSUB, add)
  define(ADCSBB, adc)
  define(func, mpn_addlsh1_n)')
ifdef(`OPERATION_rsblsh1_n', `
  define(ADDSUB, sub)
  define(ADCSBB, sbb)
  define(func, mpn_rsblsh1_n)')

MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)

C func(rp, up, vp, n)
C   addlsh1:  rp[] = up[] + (vp[] << 1)
C   rsblsh1:  rp[] = (vp[] << 1) - up[]
C The doubling of vp[] is done with add/adc of a limb to itself, so the bit
C shifted out of each limb travels in CF.  Two carry chains are therefore
C live (shift carry and add/sub carry); since there is only one CF, each is
C parked in a register as 0/-1 with sbb reg,reg and brought back into CF with
C add reg,reg.
C
C NOTE(review): vp[0] is loaded unconditionally and the n mod 4 == 0 entry
C always processes four limbs, so n = 0 is not handled here -- presumably
C callers guarantee n >= 1; confirm.
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	push	%rbp

	mov	(vp), %r8		C preload vp[0] before vp is advanced
	mov	R32(n), R32(%rax)
	C Point rp, up and vp just past their last limb and negate n, so that
	C (ptr,n,8) addresses limb 0 and n counts upwards towards zero.
	lea	(rp,n,8), rp
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n
	xor	R32(%rbp), R32(%rbp)	C acy = 0
	and	$3, R32(%rax)		C n mod 4; also leaves CF = 0
	je	L(b00)			C n mod 4 == 0: enter loop, CF = 0 as shift carry-in
	cmp	$2, R32(%rax)
	jc	L(b01)			C n mod 4 == 1
	je	L(b10)			C n mod 4 == 2
					C otherwise n mod 4 == 3, fall through

C Wind-up: handle the first 3 limbs, then join the 4-way loop.
L(b11):	add	%r8, %r8		C first limb doubled with no carry-in
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C save scy (0 or -1)
	ADDSUB	(up,n,8), %r8		C first limb: no carry/borrow-in
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	mov	%r10, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy (0 or -1)
	add	$3, n			C sets SF for the jns at L(ent)
	jmp	L(ent)

C Wind-up: handle the first 2 limbs.
L(b10):	add	%r8, %r8		C first limb doubled with no carry-in
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C save scy (0 or -1)
	ADDSUB	(up,n,8), %r8		C first limb: no carry/borrow-in
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy (0 or -1)
	add	$2, n			C sets SF for the jns at L(ent)
	jmp	L(ent)

C Wind-up: handle the first limb.
L(b01):	add	%r8, %r8		C first limb doubled with no carry-in
	sbb	R32(%rax), R32(%rax)	C save scy (0 or -1)
	ADDSUB	(up,n,8), %r8		C first limb: no carry/borrow-in
	mov	%r8, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy (0 or -1)
	inc	n			C sets SF for the jns below
L(ent):	jns	L(end)			C n no longer negative: no limbs left

C Main loop: four limbs per iteration.
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy into CF

	mov	(vp,n,8), %r8		C mov leaves CF untouched
L(b00):	adc	%r8, %r8		C shift chain: double each limb with carry
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C save scy (0 or -1)
	add	R32(%rbp), R32(%rbp)	C restore acy into CF

	ADCSBB	(up,n,8), %r8		C add/sub chain against up[]
	nop				C Hammer speedup! (no architectural effect)
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	ADCSBB	24(up,n,8), %r11
	mov	%r10, 16(rp,n,8)
	mov	%r11, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C save acy (0 or -1)
	add	$4, n
	js	L(top)			C loop while n is still negative

C Form the return value in %rax from the two saved carries (each 0 or -1).
C   addlsh1: shift carry plus add carry, i.e. 0, 1 or 2; the sum of the two
C            saved values is negated with 32-bit operations.
C   rsblsh1: shift carry minus subtract borrow, i.e. -1, 0 or 1; computed in
C            32 bits and sign-extended to 64 bits by movslq.
L(end):
ifdef(`OPERATION_addlsh1_n',`
	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)')
ifdef(`OPERATION_rsblsh1_n',`
	sub	R32(%rax), R32(%rbp)
	movslq	R32(%rbp), %rax')

	pop	%rbp
	ret
EPILOGUE()