dnl  AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, no tzcnt, no shlx.
dnl  We actually use tzcnt here, when table cannot count bits, as tzcnt always
dnl  works for our use, and helps a lot for certain CPUs.

dnl  Copyright 2019 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/bit
C AMD K8,K9	 8.9
C AMD K10	 8.8
C AMD bd1	 9.7
C AMD bd2	 7.8
C AMD bd3	 ?
C AMD bd4	 7.4
C AMD bt1	 9.2
C AMD bt2	 9.1
C AMD zn1	 7.5
C AMD zn2	 7.5
C Intel P4	 ?
C Intel CNR	10.5
C Intel PNR	10.5
C Intel NHM	 9.7
C Intel WSM	 9.7
C Intel SBR	10.7
C Intel IBR	 ?
C Intel HWL	 9.5
C Intel BWL	 8.7
C Intel SKL	 8.6
C Intel atom	18.9
C Intel SLM	14.0
C Intel GLM	 9.8
C Intel GLM+	 8.8
C VIA nano	 ?


C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
C MAXSHIFT is the number of low bits that one table lookup can resolve, and
C MASK selects those bits.
deflit(MAXSHIFT, 8)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))

C Byte table indexed by the low MAXSHIFT bits of a limb: entry 0 holds
C MAXSHIFT, followed by one entry for each i in 1..MASK.
C NOTE(review): the second DEF_OBJECT argument is presumably the alignment;
C confirm against the macro definition.
DEF_OBJECT(ctz_table,64)
	.byte	MAXSHIFT
forloop(i,1,MASK,
`	.byte	m4_count_trailing_zeros(i)
')
END_OBJECT(ctz_table)

C Register roles.  Incoming operands are (u1,,u0) and (v1,,v0).
define(`u1',       `%rdi')
define(`u0',       `%rsi')
define(`v1',       `%rdx')
define(`v0_param', `%rcx')

C v0 is moved out of %rcx on entry, because %rcx doubles as t0 and as the
C shift count.  The table load that produces cnt uses t0 as its index and
C overwrites it in the same instruction.
define(`v0',       `%rax')
define(`cnt',      `%rcx')

C (s1,,s0) keeps a copy of u from before the subtraction.
C (t1,,t0) receives the difference v - u.
define(`s0',       `%r8')
define(`s1',       `%r9')
define(`t0',       `%rcx')
define(`t1',       `%r11')

dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
ABI_SUPPORT(STD64)

C mpn_gcd_22 -- GCD of the two-limb values (u1,,u0) and (v1,,v0).
C
C In:	u1 = %rdi, u0 = %rsi, v1 = %rdx, v0 = %rcx
C Out:	at L(end) the result is (v1,,v0), which already sits in (%rdx,,%rax).
C	Once both high limbs are zero the routine instead tail-calls
C	mpn_gcd_11 with arguments (v0, u0).
C Uses:	%rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11 and the flags.
C
C NOTE(review): only factors of two of the difference u - v are removed, so
C presumably both operands must be odd on entry -- confirm with the caller
C contract.
C NOTE(review): on the mpn_gcd_11 path the high result limb %rdx is whatever
C that routine leaves behind; presumably it returns with %rdx = 0 -- confirm
C against mpn_gcd_11.
ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_gcd_22)
	FUNC_ENTRY(4)
	mov	v0_param, v0

	LEA(	ctz_table, %r10)

C Main loop.  Form v - u in (t1,,t0) and u - v in (u1,,u0).  The mov and jz
C between each sub and its sbb leave the carry flag untouched.
	ALIGN(16)
L(top):	mov	v0, t0
	sub	u0, t0
	jz	L(lowz)		C	jump when low limb result = 0
	mov	v1, t1
	sbb	u1, t1

	mov	u0, s0
	mov	u1, s1

	sub	v0, u0
	sbb	v1, u1

C Carry set here means u < v: take v - u as the new u, and the saved old u
C as the new v.
L(bck):	cmovc	t0, u0		C u = |u - v|
	cmovc	t1, u1		C u = |u - v|
	cmovc	s0, v0		C v = min(u,v)
	cmovc	s1, v1		C v = min(u,v)

C t0 equals u0 or its negation, so both have the same trailing zero count.
C The and sets ZF when the low MAXSHIFT bits are all zero.  movzbl does not
C change flags, so the jz still tests the result of the and.
	and	$MASK, R32(t0)
	movzbl	(%r10,t0), R32(cnt)
	jz	L(count_better)
C Rightshift (u1,,u0) into (u1,,u0)
C The bits leaving u1 are recovered from a copy shifted left by -cnt, which
C the 64-bit shift reduces to 64 - cnt.  A zero count is harmless only while
C u1 = 0, as is the case when arriving from L(lowz).
L(shr):	shr	R8(cnt), u0
	mov	u1, t1
	shr	R8(cnt), u1
	neg	cnt
	shl	R8(cnt), t1
	or	t1, u0

C Keep looping while either high limb is nonzero.
	test	v1, v1
	jnz	L(top)
	test	u1, u1
	jnz	L(top)

C Both high limbs are zero: finish with the single-limb routine.  u0 already
C lives in %rsi, hence the commented-out move.
L(gcd_11):
	mov	v0, %rdi
C	mov	u0, %rsi
	TCALL(	mpn_gcd_11)

C The low MAXSHIFT bits of the difference are zero, so the table cannot give
C the count.  rep;bsf is the tzcnt encoding.  u0 is nonzero here, since
C neither path reaches L(bck) with a zero difference.
L(count_better):
	rep;bsf	u0, cnt		C tzcnt!
	jmp	L(shr)

L(lowz):C We come here when v0 - u0 = 0
	C 1. If v1 - u1 = 0, then gcd is u = v.
	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
	mov	v1, t0
	sub	u1, t0
	je	L(end)

C Build (t1,,t0) = (0,,v1-u1) and (u1,,u0) = (0,,u1-v1), saving old u in
C (s1,,s0).  Both xor instructions come before the final sub, so the carry
C seen at L(bck) is the borrow of u1 - v1.
	xor	t1, t1
	mov	u0, s0
	mov	u1, s1
	mov	u1, u0
	xor	u1, u1
	sub	v1, u0
	jmp	L(bck)

C u = v: the result (v1,,v0) is already in the return registers.
L(end):	C mov	v0, %rax
	C mov	v1, %rdx
	FUNC_EXIT()
	ret
EPILOGUE()