dnl  x86-64 mpn_lshift optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.5
C AMD K10	 ?
C Intel P4	 3.29
C Intel core2	 2.1  (fluctuates, presumably cache related)
C Intel corei	 ?
C Intel atom	14.3
C VIA nano	 ?

C mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift {up,n} left by cnt bits, store the low n limbs at {rp,n}, and return
C the bits shifted out of the high limb.  Each result limb is
C (up[i] << cnt) | (up[i-1] >> (64-cnt)), built here with MMX psllq/psrlq
C pairs working from the high end of the operands downward.
C NOTE(review): the code computes the complementary shift as (-cnt) & 63,
C so it assumes 1 <= cnt <= 63 (the usual mpn shift contract) -- cnt = 0
C would make both partial shifts degenerate.

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	mov	-8(up,n,8), %rax	C rax = up[n-1], source of return bits
	movd	R32(%rcx), %mm4		C mm4 = cnt, the left-shift count
	neg	R32(%rcx)		C put rsh count in cl
	and	$63, R32(%rcx)		C cl = 64-cnt (valid for cnt in 1..63)
	movd	R32(%rcx), %mm5		C mm5 = 64-cnt, the right-shift count

	lea	1(n), R32(%r8)		C r8 = n+1, to classify n mod 4 below

	shr	R8(%rcx), %rax		C function return value

C The main loop handles 4 limbs per iteration and its feed-in below (L(rol)
C onward) consumes n = 3 (mod 4) limbs of lead-in state, so first peel off
C 0, 1 or 2 limbs to bring n into that residue class.
	and	$3, R32(%r8)
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...		C peel one limb: rp[n-1] from up[n-1], up[n-2]
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...		C peel one limb, then fall into the 2-limb peel
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
L(1x):
	cmp	$1, n			C n = 1: only the final limb remains
	je	L(ast)
C Peel two limbs (rp[n-1] and rp[n-2]) to reach n = 3 (mod 4).
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3
	movq	-16(up,n,8), %mm0
	movq	-24(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -8(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	sub	$2, n

C Feed-in for the pipelined loop: start the left shifts of the two highest
C remaining limbs; L(top)/L(end) will supply their right-shifted partners.
L(rol):	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3

	sub	$4, n			C				4
	jb	L(end)			C				2
	ALIGN(32)
C Software-pipelined main loop, 4 limbs/iteration.  Each half completes two
C result limbs begun in the previous half-iteration and starts the next two
C shifts, keeping both MMX shift operations in flight.  n counts down by 4;
C offsets are biased accordingly (24/16/8/0 relative to the new n).
L(top):
	C finish stuff from lsh block
	movq	16(up,n,8), %mm0
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	-8(up,n,8), %mm1
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	C start two new rsh
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1

	C finish stuff from rsh block
	movq	8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psllq	%mm4, %mm2
	por	%mm2, %mm0
	psllq	%mm4, %mm3
	movq	-8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	-16(up,n,8), %mm3
	movq	%mm0, 8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start two new lsh
	sub	$4, n
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3

	jae	L(top)			C				2
C Drain the pipeline: complete the two limbs left in mm2/mm3 (rp[2], rp[1]).
L(end):
	movq	8(up), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	(up), %mm1
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 16(rp)
	movq	%mm3, 8(rp)

C Final limb: rp[0] = up[0] << cnt (no lower limb to shift in).
L(ast):	movq	(up), %mm2
	psllq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C clear MMX state before returning to FP-using code
	FUNC_EXIT()
	ret
EPILOGUE()