dnl  AMD64 mpn_rshift -- mpn right shift, optimised for Atom.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel atom	 4.5
C VIA nano	 ?

C TODO
C  * Consider using 4-way unrolling.  We reach 4 c/l, but the code is 2.5 times
C    larger.
C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%rcx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_rshift -- shift the n-limb operand at up right by cnt bits, store the
C n-limb result at rp.
C
C   In:    rp  = destination limb pointer
C          up  = source limb pointer
C          n   = limb count (only its 32-bit view is examined, see R32(n))
C          cnt = bit count (only its low byte is used, see R8(cnt))
C   Out:   rp[i]   = (up[i] >> cnt) | (up[i+1] << (64-cnt))   for i < n-1
C          rp[n-1] = up[n-1] >> cnt
C          %rax    = up[0] << (64-cnt), i.e. the bits shifted out at the
C                    bottom, placed in the high end of the limb
C   Uses:  %rax, %r8, %r9, %r10, %r11, flags; rp, up, n and the low byte of
C          cnt are modified.
C
C NOTE(review): the code assumes n >= 1 and 1 <= cnt <= 63, there is no check
C for either -- confirm against the documented mpn_rshift contract.
C NOTE(review): FUNC_ENTRY/FUNC_EXIT, R8 and R32 come from config.m4;
C presumably FUNC_ENTRY(4) maps a DOS64 call with 4 arguments onto the
C registers above and R8/R32 give the 8-bit/32-bit register names.
C
C The left-shift count 64-cnt is never computed explicitly.  The low byte of
C cnt is negated instead, and the hardware masking of 64-bit shift counts to
C 6 bits turns -cnt into 64-cnt.  Each "neg R8(cnt)" therefore toggles
C between the right-shift and the left-shift count.
C
C The loop is 2-way unrolled, so an odd and an even limb count get separate
C lead-in code.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	shr	R32(n)			C n = n/2, carry = old low bit of n
	mov	(up), %rax		C rax = up[0], mov leaves carry intact
	jnc	L(evn)			C even limb count

C Odd limb count.
	mov	%rax, %r11
	shr	R8(cnt), %r11		C r11 = up[0] >> cnt
	neg	R8(cnt)			C switch to left-shift count
	shl	R8(cnt), %rax		C rax = return value, not touched again
	test	n, n
	jnz	L(gt1)			C three or more limbs
	mov	%r11, (rp)		C single limb: store it and leave
	FUNC_EXIT()
	ret

C Odd count of at least 3: set up the state expected at L(lo1).
L(gt1):	mov	8(up), %r8
	mov	%r8, %r10		C r10 = up[1] unshifted
	shl	R8(cnt), %r8		C r8 = up[1] << (64-cnt)
	jmp	L(lo1)

C Even limb count: set up the state expected at L(top) and L(end).
L(evn):	mov	%rax, %r10		C r10 = up[0] unshifted
	neg	R8(cnt)			C switch to left-shift count
	shl	R8(cnt), %rax		C rax = return value, not touched again
	mov	8(up), %r9
	mov	%r9, %r11		C r11 = up[1] unshifted
	shl	R8(cnt), %r9		C r9 = up[1] << (64-cnt)
	neg	R8(cnt)			C back to right-shift count
	dec	n			C sets ZF for the jz below
	lea	-8(rp), rp		C bias rp so 8(rp) is the next result limb
	lea	8(up), up		C bias up so 8(up) is the next source limb
	jz	L(end)			C lea keeps flags: taken when n was 2

C State at L(top): r10 = lower source limb unshifted, r11 = next source limb
C unshifted, r9 = that next limb << (64-cnt), count byte = right-shift count.
C State at L(lo1): r11 = lower source limb >> cnt, r10 = next source limb
C unshifted, r8 = that next limb << (64-cnt), count byte = left-shift count.
C Each pass through L(top)..jg stores two result limbs.
	ALIGN(8)
L(top):	shr	R8(cnt), %r10
	or	%r10, %r9		C r9 = finished result limb
	shr	R8(cnt), %r11
	neg	R8(cnt)			C switch to left-shift count
	mov	8(up), %r8
	mov	%r8, %r10
	mov	%r9, 8(rp)
	shl	R8(cnt), %r8
	lea	16(rp), rp
L(lo1):	mov	16(up), %r9
	or	%r11, %r8		C r8 = finished result limb
	mov	%r9, %r11
	shl	R8(cnt), %r9
	lea	16(up), up
	neg	R8(cnt)			C back to right-shift count
	mov	%r8, (rp)
	dec	n
	jg	L(top)

C Wind down: store the last two result limbs.
L(end):	shr	R8(cnt), %r10
	or	%r10, %r9
	shr	R8(cnt), %r11		C top limb, zero-filled from above
	mov	%r9, 8(rp)
	mov	%r11, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()