dnl  x86 mpn_gcd_11 optimised for processors with slow BSF.

dnl  Based on C version.

dnl  Copyright 2019 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

dnl  Rudimentary code for x86-32, i.e. for CPUs without cmov.  Also, the bsf
dnl  instruction is assumed to be so slow it is useless.  Instead a table is
dnl  used.
dnl
dnl  The loop benefits from OoO, in-order CPUs might want a different loop.
dnl  The ebx and ecx registers could be combined if the assignment of ecx were
dnl  postponed until ebx died, but that would at least hurt in-order CPUs.

C            cycles/bit (approx)
C AMD K7         ?
C AMD K8,K9      ?
C AMD K10        ?
C AMD bd1        ?
C AMD bd2        ?
C AMD bd3        ?
C AMD bd4        ?
C AMD bt1        ?
C AMD bt2        ?
C AMD zn1        ?
C AMD zn2        ?
C Intel P4-2     ?
C Intel P4-3/4   ?
C Intel P6/13    ?
C Intel CNR      ?
C Intel NHM      ?
C Intel SBR      ?
C Intel IBR      ?
C Intel HWL      ?
C Intel BWL      ?
C Intel SKL      ?
C Intel atom     ?
C Intel SLM      ?
C Intel GLM      ?
C Intel GLM+     ?
C VIA nano       ?
C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1

C MAXSHIFT is the number of low bits inspected per table lookup, and MASK is
C the bit mask that selects exactly those bits.
deflit(MAXSHIFT, 6)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))

C ctz_table[i] holds the trailing zero count of i for i = 1..MASK.  Entry 0
C holds MAXSHIFT, but the value loaded from entry 0 is never used as a shift
C count: an index of 0 branches to L(shift_alot), which overwrites ecx.
DEF_OBJECT(ctz_table,64)
	.byte	MAXSHIFT
forloop(i,1,MASK,
`	.byte	m4_count_trailing_zeros(i)
')
END_OBJECT(ctz_table)

C Register roles: u0 is the running difference, v0 the running minimum.
define(`u0',    `%eax')
define(`v0',    `%edx')


C mpn_gcd_11
C
C Inputs:  two 32-bit stack arguments, found at 16(%esp) and 20(%esp) once the
C          three registers below have been pushed.
C Output:  the final value of v0, returned in eax.
C Saves and restores edi, esi and ebx.  ecx is used as scratch and is not
C restored.
C
C Each iteration replaces (u, v) by (|u - v| with its low zero bits shifted
C out, min(u, v)) and stops when u - v becomes zero.
C NOTE(review): only factors of two of the difference are removed, which
C presumably relies on both inputs being odd -- confirm against the callers.
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_gcd_11)
	push	%edi
	push	%esi
	push	%ebx

	mov	16(%esp), u0
	mov	20(%esp), v0
	LEAL( ctz_table, %esi)
	sub	v0, u0			C u = u - v		0
	jz	L(end)

C Loop entry: u0 holds a nonzero u - v, and the carry flag is the borrow from
C that subtract.  sbb turns the borrow into a mask in ebx: all ones when the
C subtract borrowed (u < v unsigned), zero otherwise.
C   edi = (u - v) & mask, so adding it to v0 yields u when u < v, else v.
C   (u0 ^ mask) - mask negates u0 when the mask is all ones.
C   ecx keeps the un-negated difference; it is only used as a table index.
	ALIGN(16)
L(top):	sbb	%ebx, %ebx		C mask			1
	mov	u0, %edi		C			1
	mov	u0, %ecx		C			1
	and	%ebx, %edi		C			2
	xor	%ebx, u0		C			2
	add	%edi, v0		C v = min(u,v)		3
	sub	%ebx, u0		C u = |u - v|		3
C The jz below tests the zero flag produced by the and; the movzbl between
C them does not modify flags.  Zero means the low MAXSHIFT bits were all zero.
L(mid):	and	$MASK, %ecx		C			2
	movzbl	(%esi,%ecx), %ecx	C			3
	jz	L(shift_alot)
	shr	%cl, u0			C			4
	sub	v0, u0			C u = u - v		0,5
	jnz	L(top)

C u - v reached zero, so v0 is the result.
L(end):	mov	v0, %eax
	pop	%ebx
	pop	%esi
	pop	%edi
	ret

C The low MAXSHIFT bits of the difference were all zero.  Shift them out and
C retry the table lookup on the next group of bits.
L(shift_alot):
	shr	$MAXSHIFT, u0
	mov	u0, %ecx
	jmp	L(mid)
EPILOGUE()
ASM_END()