dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb   cycles/limb   cycles/limb    good
C                   aligned       unaligned     best seen      for cpu?
C AMD K8,K9            2.0           illop      1.0/1.0        N
C AMD K10              0.85          illop                     Y/N
C AMD bd1              0.70          0.66                      Y
C AMD bd2              0.68          0.66                      Y
C AMD bd3               ?             ?
C AMD bd4               ?             ?
C AMD bt1              1.97          8.16       1.5/1.5        N
C AMD bt2              0.77          0.93       0.65/opt       N/Y
C AMD zn1               ?             ?
C AMD zn2               ?             ?
C Intel P4             2.26          illop                     Y/N
C Intel CNR            0.52          0.64       opt/opt        Y
C Intel NHM            0.52          0.71       0.50/0.67      N
C Intel SBR            0.51          0.54       opt/0.51       Y
C Intel IBR            0.50          0.54       opt/opt        Y
C Intel HWL             0.50          0.51      opt/opt        Y
C Intel BWL             0.55          0.55      opt/opt        Y
C Intel atom            1.16          1.61      opt/opt        Y
C Intel SLM             1.02          1.07      opt/opt        Y
C VIA nano              1.09          1.08      opt/opt        Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code to use it here.
C
C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.
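C The following is a comment-only sketch of the dispatch structure below; it
C is not part of the build, and the helper names are hypothetical.  A single
C limb is copied first if rp is not 16-byte aligned, after which the alignment
C of up decides between the plain aligned loop and the palignr loop.
C
C   if (n <= COPYI_SSE_THRESHOLD)
C     copy_with_64bit_loop (rp, up, n);        /* L(bc) basecase */
C   else
C     {
C       if (((uintptr_t) rp & 8) != 0)         /* make rp 16-byte aligned */
C         *rp++ = *up++, n--;
C       if (((uintptr_t) up & 8) != 0)
C         copy_with_palignr (rp, up, n);       /* L(uent) path */
C       else
C         copy_with_aligned_sse (rp, up, n);   /* L(atop) path */
C     }
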
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyi)
        FUNC_ENTRY(3)

        cmp     $COPYI_SSE_THRESHOLD, n
        jbe     L(bc)

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned

        movsq                           C copy one limb
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret
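C The palignr path below forms each aligned 16-byte store from two aligned
C 16-byte loads.  As a comment-only sketch (not part of the build, with
C hypothetical variable names), one combine step corresponds to the following
C SSSE3 intrinsics, for a limb pointer src whose address is 8 (mod 16) and a
C 16-byte aligned limb pointer dst:
C
C   #include <tmmintrin.h>                     /* SSSE3: _mm_alignr_epi8 */
C
C   __m128i lo = _mm_load_si128 ((const __m128i *) (src - 1)); /* src[-1],src[0] */
C   __m128i hi = _mm_load_si128 ((const __m128i *) (src + 1)); /* src[1],src[2]  */
C   __m128i t  = _mm_alignr_epi8 (hi, lo, 8);                  /* src[0],src[1]  */
C   _mm_store_si128 ((__m128i *) dst, t);
C
C The main loop below performs eight such combines per iteration, reusing each
C aligned load for two palignr steps.
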
L(uent):
C Code handling up - rp = 8 (mod 16)

        cmp     $16, n
        jc      L(ued0)

IFDOS(` add     $-56, %rsp      ')
IFDOS(` movdqa  %xmm6, (%rsp)   ')
IFDOS(` movdqa  %xmm7, 16(%rsp) ')
IFDOS(` movdqa  %xmm8, 32(%rsp) ')

        movaps  120(up), %xmm7
        movaps  104(up), %xmm6
        movaps  88(up), %xmm5
        movaps  72(up), %xmm4
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        lea     128(up), up
        sub     $32, n
        jc      L(ued1)

        ALIGN(16)
L(utop):movaps  -104(up), %xmm1
        sub     $16, n
        movaps  -120(up), %xmm0
        palignr($8, %xmm6, %xmm7)
        movaps  -136(up), %xmm8
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movaps  120(up), %xmm7
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movaps  104(up), %xmm6
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movaps  88(up), %xmm5
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movaps  72(up), %xmm4
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movaps  56(up), %xmm3
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movaps  40(up), %xmm2
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        lea     128(up), up
        movdqa  %xmm0, (rp)
        lea     128(rp), rp
        jnc     L(utop)

L(ued1):movaps  -104(up), %xmm1
        movaps  -120(up), %xmm0
        movaps  -136(up), %xmm8
        palignr($8, %xmm6, %xmm7)
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        movdqa  %xmm0, (rp)
        lea     128(rp), rp

IFDOS(` movdqa  (%rsp), %xmm6   ')
IFDOS(` movdqa  16(%rsp), %xmm7 ')
IFDOS(` movdqa  32(%rsp), %xmm8 ')
IFDOS(` add     $56, %rsp       ')

L(ued0):test    $8, R8(n)
        jz      1f
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        movaps  -8(up), %xmm4
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm4, %xmm0)
        lea     64(up), up
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movaps  -8(up), %xmm3
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small-operand speed, not for correctness,
C as the code above is currently written.

L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

        ALIGN(16)
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
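
C Comment-only usage sketch, not part of the build: the entry point above is
C reached from C through the documented low-level mpn interface declared in
C gmp.h, roughly as follows (example values only).
C
C   #include <gmp.h>
C
C   mp_limb_t src[4] = { 1, 2, 3, 4 };
C   mp_limb_t dst[4];
C
C   mpn_copyi (dst, src, 4);   /* copies limbs in increasing address order */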