dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb   cycles/limb   cycles/limb   good
C                  aligned       unaligned     best seen     for cpu?
C AMD K8,K9         3             3             2.35         no, use shl/shr
C AMD K10           1.5-1.8       1.5-1.8       1.33         yes
C AMD bd1           1.7-1.9       1.7-1.9       1.33         yes
C AMD bobcat        3.17          3.17                       yes, bad for n < 20
C Intel P4          4.67          4.67          2.7          no, slow movdqu
C Intel core2       2.15          2.15          1.25         no, use shld/shrd
C Intel NHM         1.66          1.66          1.25         no, use shld/shrd
C Intel SBR         1.3           1.3           1.25         yes, bad for n = 4-6
C Intel atom       11.7          11.7           4.5          no
C VIA nano          5.7           5.95          2.0          no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.
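
C The code below computes the usual mpn_lshift operation; a rough C sketch
C follows (not part of the build; limb types simplified to uint64_t, and
C 1 <= cnt < 64 assumed, as usual for mpn shift routines):
C
C   uint64_t mpn_lshift (uint64_t *rp, const uint64_t *ap, long n, unsigned cnt)
C   {
C     uint64_t retval = ap[n - 1] >> (64 - cnt);  /* bits shifted out at the top */
C     for (long i = n - 1; i > 0; i--)            /* high-to-low, so rp >= ap may overlap */
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }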

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_lshift)
        FUNC_ENTRY(4)
        movd    R32(%rcx), %xmm4        C xmm4 = cnt
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5        C xmm5 = 64-cnt

        neg     R32(%rcx)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax          C return value: ap[n-1] >> (64-cnt)

        cmp     $3, n
        jle     L(bc)

        lea     (rp,n,8), R32(%rcx)
        bt      $3, R32(%rcx)           C is rp+8n already 16-byte aligned?
        jnc     L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n

L(rp_aligned):
        lea     1(n), %r8d

C Dispatch on n mod 8, entering the unrolled loop (8 limbs/iteration) so that
C only 1 or 2 limbs are left for the wind-down code after the loop
        and     $6, R32(%r8)
        jz      L(ba0)
        cmp     $4, R32(%r8)
        jz      L(ba4)
        jc      L(ba2)
L(ba6): add     $-4, n
        jmp     L(i56)
L(ba0): add     $-6, n
        jmp     L(i70)
L(ba4): add     $-2, n
        jmp     L(i34)
L(ba2): add     $-8, n
        jle     L(end)

        ALIGN(16)
L(top): movdqu  40(ap,n,8), %xmm1
        movdqu  48(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 48(rp,n,8)
L(i70):
        movdqu  24(ap,n,8), %xmm1
        movdqu  32(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 32(rp,n,8)
L(i56):
        movdqu  8(ap,n,8), %xmm1
        movdqu  16(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 16(rp,n,8)
L(i34):
        movdqu  -8(ap,n,8), %xmm1
        movdqu  (ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp,n,8)
        sub     $8, n
        jg      L(top)

C Wind down: 1 or 2 low limbs remain
L(end): bt      $0, R32(n)
        jc      L(end8)

        movdqu  (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq %xmm1, %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret

C Basecase (n <= 3)
        ALIGN(16)
L(bc):  dec     R32(n)
        jz      L(end8)

        movq    (ap,n,8), %xmm1
        movq    -8(ap,n,8), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, (rp,n,8)
        sub     $2, R32(n)
        jl      L(end8)
        movq    8(ap), %xmm1
        movq    (ap), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, 8(rp)

L(end8):movq    (ap), %xmm0
        psllq   %xmm4, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret
EPILOGUE()