dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.

dnl  Copyright 2006, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.1	< 3.85 for lshift + add_n
C AMD K10	 3.1	< 3.85 for lshift + add_n
C Intel P4	14.6	> 7.33 for lshift + add_n
C Intel core2	 3.87	> 3.27 for lshift + add_n
C Intel NHM	 4	> 3.75 for lshift + add_n
C Intel SBR	(5.8)	> 3.46 for lshift + add_n
C Intel atom	(7.75)	< 8.75 for lshift + add_n
C VIA nano	 4.7	< 6.25 for lshift + add_n

C This was written quickly and not optimized at all.  Surely one could get
C closer to 3 c/l or perhaps even under 3 c/l.
C Ideas:
C  1) Use indexing to save the 3 LEA
C  2) Write reasonable feed-in code
C  3) Be more clever about register usage
C  4) Unroll more, handling CL negation, carry save/restore cost much now
C  5) Reschedule

C INPUT PARAMETERS
C   rp   destination limb vector
C   up   first source limb vector (added to / subtracted from the shifted V)
C   vp   second source limb vector (the one shifted left by cnt)
C   n    number of limbs (assumed >= 1 per mpn convention -- TODO confirm)
C   cnt  shift count; shifts here rely on x86 mod-64 count semantics, so
C        1 <= cnt <= 63 is assumed
define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n', `%rcx')
define(`cnt', `%r8')

C The two entry points share all code; only the add-with-carry vs
C subtract-with-borrow instruction differs:
C   mpn_addlsh_n:  {rp,n} = {up,n} + ({vp,n} << cnt)   (ADCSBB = adc)
C   mpn_rsblsh_n:  {rp,n} = ({vp,n} << cnt) - {up,n}   (ADCSBB = sbb)
ifdef(`OPERATION_addlsh_n',`
	define(ADCSBB, `adc')
	define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
	define(ADCSBB, `sbb')
	define(func, mpn_rsblsh_n)
')

MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')	C 5th arg (cnt) comes on the stack under DOS64
	push	%r12
	push	%r13
	push	%r14
	push	%rbp
	push	%rbx

	mov	n, %rax
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%rbp), R32(%rbp)	C limb carry

C Peel off n mod 4 limbs one at a time so the main loop can be 4-way.
	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)
	je	L(4)
	sub	$1, R32(%r11)

C Single-limb feed-in loop.  Each v limb is split into (v << cnt),
C merged with the bits carried out of the previous limb, and
C (v >> (64-cnt)), saved in %rbp for the next limb.  The shr by the
C negated count works because x86 shift counts are taken mod 64.
C The carry flag cannot survive the shift/neg instructions, so it is
C parked in %rbx (as 0 or -1) across iterations: `add %ebx,%ebx`
C restores CF, `sbb %ebx,%ebx` saves it again.
L(012):	mov	(vp), %r8
	mov	%r8, %r12
	shl	R8(%rcx), %r8
	or	%rbp, %r8		C merge carry bits of previous limb
	neg	R8(%rcx)		C cnt := -cnt, i.e. 64-cnt mod 64
	mov	%r12, %rbp
	shr	R8(%rcx), %rbp		C high bits become next limb carry
	neg	R8(%rcx)		C restore cnt
	add	R32(%rbx), R32(%rbx)	C restore carry flag from %rbx
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	sbb	R32(%rbx), R32(%rbx)	C save carry flag in %rbx
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(012)

L(4):	sub	$4, %rax
	jc	L(end)

	ALIGN(16)
C Main loop, 4 limbs/iteration.  Same shift-and-merge scheme as above,
C batched: all shl first, one neg of the count, all shr, then the
C carry flag is restored from %rbx for the adc/sbb chain and saved
C back afterwards.  %rbp carries the top limb's shifted-out bits into
C the next iteration.
L(top):	mov	(vp), %r8
	mov	%r8, %r12
	mov	8(vp), %r9
	mov	%r9, %r13
	mov	16(vp), %r10
	mov	%r10, %r14
	mov	24(vp), %r11

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%rbp, %r8		C merge limb carry from previous iteration
	mov	%r11, %rbp
	shl	R8(%rcx), %r11

	neg	R8(%rcx)		C cnt := 64-cnt (mod-64 semantics)

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %rbp		C used next iteration

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)		C restore cnt

	add	R32(%rbx), R32(%rbx)	C restore carry flag

	ADCSBB	(up), %r8
	ADCSBB	8(up), %r9
	ADCSBB	16(up), %r10
	ADCSBB	24(up), %r11

	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	lea	32(up), up
	lea	32(vp), vp
	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(top)

C Return value: the bits shifted out of the top limb (%rbp), adjusted
C by the final carry (addlsh) or borrow (rsblsh).
L(end):	add	R32(%rbx), R32(%rbx)	C restore carry flag one last time
	ADCSBB	$0, %rbp
	mov	%rbp, %rax
	pop	%rbx
	pop	%rbp
	pop	%r14
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()