dnl  x86-64 mpn_rshift optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.5
C AMD K10	 ?
C Intel P4	 3.29
C Intel core2	 2.1 (fluctuates, presumably cache related)
C Intel corei	 ?
C Intel atom	14.3
C VIA nano	 ?

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mp_limb_t mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift {up,n} right by cnt bits, store the result at {rp,n}, and return the
C bits shifted out of the low limb, i.e. up[0] << (64-cnt).  Each result limb
C is formed as (up[i] >> cnt) | (up[i+1] << (64-cnt)), computed with MMX
C 64-bit shifts.  Assumes 1 <= cnt <= 63 (the usual mpn shift contract; the
C complement count 64-cnt computed via neg/and below relies on cnt != 0).
C
C Register usage:
C   %mm4  right-shift count (cnt)
C   %mm5  left-shift count (64-cnt), for the bits from the next-higher limb
C   n     negated limb index, incremented toward zero (up/rp are biased to
C         point at the last limb, so (up,n,8) walks from low to high limbs)
C   %r8   (n+1) mod 4, used to dispatch on n mod 4 before the 4-way loop

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	mov	(up), %rax		C low limb, for the return value
	movd	R32(%rcx), %mm4		C mm4 = cnt
	neg	R32(%rcx)		C put lsh count in cl
	and	$63, R32(%rcx)		C rcx = 64-cnt (valid since cnt != 0)
	movd	R32(%rcx), %mm5		C mm5 = 64-cnt

	lea	-8(up,n,8), up		C bias pointers so (up,n,8) indexes
	lea	-8(rp,n,8), rp		C   with the negated count below
	lea	1(n), R32(%r8)
	neg	n			C n = -n, counts up toward zero

	shl	R8(%rcx), %rax		C function return value

	and	$3, R32(%r8)		C r8 = (n+1) mod 4
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...		C one peeled limb, then main cases
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2		C rp[0] = up[0]>>cnt | up[1]<<(64-cnt)
	movq	%mm2, 8(rp,n,8)
	inc	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...		C peel one limb, fall into the 1-case
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
L(1x):
	cmp	$-1, n
	je	L(ast)			C single limb left: only the final store
	movq	8(up,n,8), %mm2		C peel two more limbs so that the
	psrlq	%mm4, %mm2		C   remaining count is 3 mod 4
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3
	movq	16(up,n,8), %mm0
	movq	24(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 8(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	add	$2, n

C Main loop: 4 limbs/iteration, software-pipelined two outputs deep.  On
C entry to L(rol), mm2/mm3 hold the right-shifted ("rsh") halves of the next
C two result limbs; the loop ORs in the left-shifted ("lsh") halves from the
C neighbouring limbs and starts the next pair.  Note the MMX shifts between
C `add $4, n' and the jcc do not affect flags, so the carry from the add
C controls the loop.
L(rol):	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3

	add	$4, n			C				4
	jb	L(end)			C				2
	ALIGN(32)
L(top):
	C finish stuff from lsh block
	movq	-16(up,n,8), %mm0
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2		C complete result limb 0 of 4
	psllq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3		C complete result limb 1 of 4
	movq	8(up,n,8), %mm1
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	C start two new rsh
	psllq	%mm5, %mm0		C lsh halves for result limbs 2 and 3
	psllq	%mm5, %mm1

	C finish stuff from rsh block
	movq	-8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psrlq	%mm4, %mm2
	por	%mm2, %mm0		C complete result limb 2 of 4
	psrlq	%mm4, %mm3
	movq	8(up,n,8), %mm2
	por	%mm3, %mm1		C complete result limb 3 of 4
	movq	16(up,n,8), %mm3
	movq	%mm0, -8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start two new lsh
	add	$4, n
	psrlq	%mm4, %mm2		C rsh halves for next iteration
	psrlq	%mm4, %mm3

	jae	L(top)			C				2
L(end):
	movq	-8(up), %mm0		C drain the pipeline: finish the two
	psllq	%mm5, %mm0		C   limbs left in flight in mm2/mm3
	por	%mm0, %mm2
	movq	(up), %mm1
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -16(rp)
	movq	%mm3, -8(rp)

L(ast):	movq	(up), %mm2		C high limb: no higher neighbour,
	psrlq	%mm4, %mm2		C   plain right shift
	movq	%mm2, (rp)
	emms				C leave x87/MMX state clean per ABI
	FUNC_EXIT()
	ret
EPILOGUE()