dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb   cycles/limb   cycles/limb      good
C                    aligned      unaligned     best seen     for cpu?
C AMD K8,K9           3              3              ?        no, use shl/shr
C AMD K10             1.8-2.0        1.8-2.0        ?        yes
C AMD bd1             1.9            1.9            ?        yes
C AMD bobcat          3.67           3.67                    yes, bad for n < 20
C Intel P4            4.75           4.75           ?        no, slow movdqu
C Intel core2         2.27           2.27           ?        no, use shld/shrd
C Intel NHM           2.15           2.15           ?        no, use shld/shrd
C Intel SBR           1.45           1.45           ?        yes, bad for n = 4-6
C Intel atom         12.9           12.9            ?        no
C VIA nano            6.18           6.44           ?        no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, and discrete integer
C    shifts for other affected CPUs.
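C The function stores the one's complement of ({ap,n} shifted left by cnt
C bits) at {rp,n}, and returns the out-shifted high bits (uncomplemented).
C A minimal C sketch of that operation, assuming 64-bit limbs and
C 1 <= cnt <= 63; the name lshiftc_ref is illustrative only, this is not
C the code assembled below:
C
C     uint64_t
C     lshiftc_ref (uint64_t *rp, const uint64_t *ap, long n, unsigned cnt)
C     {
C       unsigned tnc = 64 - cnt;                /* complementary shift count */
C       uint64_t retval = ap[n - 1] >> tnc;     /* bits shifted out at the top */
C       for (long i = n - 1; i > 0; i--)
C         rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> tnc));
C       rp[0] = ~(ap[0] << cnt);
C       return retval;
C     }
C
C The SSE code below computes the same thing two limbs at a time, using
C movdqu loads and aligned movdqa stores.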
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`ap', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_lshiftc)
        FUNC_ENTRY(4)
        movd    R32(%rcx), %xmm4
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5

        neg     R32(%rcx)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax

        pcmpeqb %xmm3, %xmm3            C set to 111...111

        cmp     $3, n
        jle     L(bc)

        lea     (rp,n,8), R32(%rcx)
        bt      $3, R32(%rcx)
        jnc     L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n

L(rp_aligned):
        lea     1(n), %r8d

        and     $6, R32(%r8)
        jz      L(ba0)
        cmp     $4, R32(%r8)
        jz      L(ba4)
        jc      L(ba2)
L(ba6): add     $-4, n
        jmp     L(i56)
L(ba0): add     $-6, n
        jmp     L(i70)
L(ba4): add     $-2, n
        jmp     L(i34)
L(ba2): add     $-8, n
        jle     L(end)

        ALIGN(16)
L(top): movdqu  40(ap,n,8), %xmm1
        movdqu  48(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, 48(rp,n,8)
L(i70):
        movdqu  24(ap,n,8), %xmm1
        movdqu  32(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, 32(rp,n,8)
L(i56):
        movdqu  8(ap,n,8), %xmm1
        movdqu  16(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, 16(rp,n,8)
L(i34):
        movdqu  -8(ap,n,8), %xmm1
        movdqu  (ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, (rp,n,8)
        sub     $8, n
        jg      L(top)

L(end): bt      $0, R32(n)
        jc      L(end8)

        movdqu  (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq %xmm1, %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret

C Basecase
        ALIGN(16)
L(bc):  dec     R32(n)
        jz      L(end8)

        movq    (ap,n,8), %xmm1
        movq    -8(ap,n,8), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, (rp,n,8)
        sub     $2, R32(n)
        jl      L(end8)
        movq    8(ap), %xmm1
        movq    (ap), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, 8(rp)

L(end8):movq    (ap), %xmm0
        psllq   %xmm4, %xmm0
        pxor    %xmm3, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret
EPILOGUE()