dnl  AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb  cycles/limb  cycles/limb   good
C                   aligned      unaligned    best seen    for cpu?
C AMD K8,K9          3            3            2.35        no, use shl/shr
C AMD K10            1.5-1.8      1.5-1.8      1.33        yes
C AMD bd1            1.7-1.9      1.7-1.9      1.33        yes
C AMD bobcat         3.17         3.17                     yes, bad for n < 20
C Intel P4           4.67         4.67         2.7         no, slow movdqu
C Intel core2        2.15         2.15         1.25        no, use shld/shrd
C Intel NHM          1.66         1.66         1.25        no, use shld/shrd
C Intel SBR          1.3          1.3          1.25        yes, bad for n = 4-6
C Intel atom        11.7         11.7          4.5         no
C VIA nano           5.7          5.95         2.0         no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.
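C The loop below is easier to follow next to a C sketch of the same idea.  The
C following is an illustrative SSE2 intrinsics paraphrase, not GMP code: the
C name rshift_sse2_sketch is made up, and it assumes 64-bit limbs,
C 1 <= cnt <= 63, n >= 2, and a 16-byte aligned rp (the real code first aligns
C rp and then handles the edges with 8-byte operations).
C
C   #include <stdint.h>
C   #include <emmintrin.h>
C
C   static uint64_t
C   rshift_sse2_sketch (uint64_t *rp, const uint64_t *ap, long n, unsigned cnt)
C   {
C     __m128i c  = _mm_cvtsi32_si128 ((int) cnt);         /* like xmm4 below */
C     __m128i tc = _mm_cvtsi32_si128 ((int) (64 - cnt));  /* like xmm5 below */
C     uint64_t retval = ap[0] << (64 - cnt);              /* bits shifted out */
C     long i;
C     for (i = 0; i + 2 < n; i += 2)
C       {
C         __m128i lo = _mm_loadu_si128 ((const __m128i *) (ap + i));      /* movdqu */
C         __m128i hi = _mm_loadu_si128 ((const __m128i *) (ap + i + 1));  /* movdqu */
C         __m128i w  = _mm_or_si128 (_mm_srl_epi64 (lo, c),
C                                    _mm_sll_epi64 (hi, tc));
C         _mm_store_si128 ((__m128i *) (rp + i), w);                      /* movdqa */
C       }
C     for (; i < n - 1; i++)                              /* 1 or 2 limbs left */
C       rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
C     rp[n - 1] = ap[n - 1] >> cnt;
C     return retval;
C   }
C
C The real loop below additionally unrolls this four times (8 limbs per
C iteration) and jumps into the middle of the unrolled loop based on n mod 8.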

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

	neg	R32(%rcx)
	mov	(ap), %rax
	shl	R8(%rcx), %rax		C retval = ap[0] << (64-cnt)

	cmp	$3, n
	jle	L(bc)

	bt	$3, R32(rp)
	jnc	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	(ap), %xmm0
	movq	8(ap), %xmm1
	psrlq	%xmm4, %xmm0
	psllq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	lea	8(ap), ap
	lea	8(rp), rp
	dec	n

L(rp_aligned):
	lea	1(n), R32(%r8)
	lea	(ap,n,8), ap
	lea	(rp,n,8), rp
	neg	n

	and	$6, R32(%r8)		C select feed-in point from n mod 8
	jz	L(bu0)
	cmp	$4, R32(%r8)
	jz	L(bu4)
	jc	L(bu2)
L(bu6):	add	$4, n
	jmp	L(i56)
L(bu0):	add	$6, n
	jmp	L(i70)
L(bu4):	add	$2, n
	jmp	L(i34)
L(bu2):	add	$8, n
	jge	L(end)

	ALIGN(16)
C Main loop: 8 limbs (four 16-byte stores) per iteration
L(top):	movdqu	-64(ap,n,8), %xmm1
	movdqu	-56(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -64(rp,n,8)
L(i70):
	movdqu	-48(ap,n,8), %xmm1
	movdqu	-40(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -48(rp,n,8)
L(i56):
	movdqu	-32(ap,n,8), %xmm1
	movdqu	-24(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -32(rp,n,8)
L(i34):
	movdqu	-16(ap,n,8), %xmm1
	movdqu	-8(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp,n,8)
	add	$8, n
	jl	L(top)

C Wind-down: one or two limbs remain
L(end):	bt	$0, R32(n)
	jc	L(e1)

	movdqu	-16(ap), %xmm1
	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp)
	FUNC_EXIT()
	ret

L(e1):	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, -8(rp)
	FUNC_EXIT()
	ret

C Basecase, for n <= 3
	ALIGN(16)
L(bc):	dec	R32(n)
	jnz	1f
	movq	(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret

1:	movq	(ap), %xmm1
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	dec	R32(n)
	jnz	1f
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 8(rp)
	FUNC_EXIT()
	ret

1:	movq	8(ap), %xmm1
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()
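
C A minimal usage sketch (illustrative only, not part of this file): calling
C mpn_rshift through the public mpn interface from C.  The sample values are
C arbitrary, and the casts to unsigned long for printf are an assumption about
C the limb size on the target.
C
C   #include <stdio.h>
C   #include <gmp.h>
C
C   int
C   main (void)
C   {
C     mp_limb_t a[4] = { 0x0123456789abcdefUL, 0xfedcba9876543210UL, 5, 9 };
C     mp_limb_t r[4];
C     mp_limb_t out = mpn_rshift (r, a, 4, 13);   /* {a,4} >> 13 into {r,4} */
C     printf ("out=%lx r[0]=%lx r[3]=%lx\n",
C             (unsigned long) out, (unsigned long) r[0], (unsigned long) r[3]);
C     return 0;
C   }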