dnl  AMD64 mpn_gcd_11 -- 1 x 1 gcd.

dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
dnl  Granlund.

dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/bit
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bd1	 ?
C AMD bd2	 ?
C AMD bd3	 ?
C AMD bd4	 ?
C AMD bt1	 5.4
C AMD bt2	 ?
C AMD zn1	 ?
C AMD zn2	 ?
C Intel P4	 ?
C Intel CNR	 ?
C Intel PNR	 ?
C Intel NHM	 ?
C Intel WSM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel SKL	 ?
C Intel atom	 ?
C Intel SLM	 ?
C Intel GLM	 ?
C Intel GLM+	 ?
C VIA nano	 ?


C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
C The low MAXSHIFT bits of a difference are used as the index into ctz_table
C and MASK isolates exactly those bits.

deflit(MAXSHIFT, 8)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))

C One byte per index: entry 0 holds MAXSHIFT and entries 1 to MASK are
C generated with m4_count_trailing_zeros.

DEF_OBJECT(ctz_table,64)
	.byte	MAXSHIFT
forloop(i,1,MASK,
`	.byte	m4_count_trailing_zeros(i)
')
END_OBJECT(ctz_table)

C Register roles
C   u0     first argument and then the shifted difference of each round
C   v0     second argument and then the smaller operand of each round
C   shcnt  right shift count for the current round
C   u_old  copy of u0 taken before the subtraction
C   delta  v0 - u0 as computed at the end of the previous round
C   r10    base address of ctz_table

define(`u0',    `%rdi')
define(`v0',    `%rsi')

define(`shcnt', `%rcx')
define(`u_old', `%rax')
define(`delta', `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_gcd_11 takes two limb arguments (u0 and v0) and returns the final v0
C in rax.
C NOTE(review): each round only strips trailing zero bits from the difference
C so this presumably expects both inputs to be odd -- confirm against callers.

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_gcd_11)
	FUNC_ENTRY(2)
	LEA(	ctz_table, %r10)
	mov	v0, delta
	sub	u0, delta		C delta = v - u
	jz	L(done)			C operands are already equal

	ALIGN(16)
L(lp):
	mov	u0, u_old		C keep u in case it is the smaller operand
	sub	v0, u0			C u - v with carry set when u < v
	cmovc	delta, u0		C on borrow take v - u so u = |u - v|
	cmovc	u_old, v0		C on borrow take old u so v = min(u,v)
	and	$MASK, R32(delta)	C table index and ZF set when it is zero
	movzbl	(%r10,delta), R32(shcnt)	C flags from the and stay intact
	jz	L(use_bsf)
L(sh):
	shr	R8(shcnt), u0		C strip trailing zero bits of the difference
	mov	v0, delta
	sub	u0, delta		C delta = v - u for the next round
	jnz	L(lp)

L(done):
	mov	v0, %rax		C return value
	C rdx is zero here for the benefit of the internal gcd_22 call
	FUNC_EXIT()
	ret

C Out of line path for an index of zero.  The table entry is then only
C MAXSHIFT so the exact count is taken from the full nonzero difference.
L(use_bsf):
	bsf	u0, shcnt
	jmp	L(sh)
EPILOGUE()