xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/gcd_22.asm (revision 867d70fc718005c0918b8b8b2f9d7f2d52d0a0db)
1dnl  AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, no tzcnt, no shlx.
2dnl  We actually use tzcnt here (encoded as rep;bsf) when the table cannot
3dnl  count the bits, as tzcnt always works for our use and helps a lot on certain CPUs.
4
5dnl  Copyright 2019 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35
36C	     cycles/bit
37C AMD K8,K9	 8.9
38C AMD K10	 8.8
39C AMD bd1	 9.7
40C AMD bd2	 7.8
41C AMD bd3	 ?
42C AMD bd4	 7.4
43C AMD bt1	 9.2
44C AMD bt2	 9.1
45C AMD zn1	 7.5
46C AMD zn2	 7.5
47C Intel P4	 ?
48C Intel CNR	10.5
49C Intel PNR	10.5
50C Intel NHM	 9.7
51C Intel WSM	 9.7
52C Intel SBR	10.7
53C Intel IBR	 ?
54C Intel HWL	 9.5
55C Intel BWL	 8.7
56C Intel SKL	 8.6
57C Intel atom	18.9
58C Intel SLM	14.0
59C Intel GLM	 9.8
60C Intel GLM+	 8.8
61C VIA nano	 ?
62
63
64C ctz_table[n] is the number of trailing zeros in n, or MAXSHIFT if n==0.
C
C The table is indexed by the low MAXSHIFT bits of a limb (the index is
C formed with "and $MASK"), so it holds MASK+1 one-byte entries: entry 0
C is emitted explicitly and entries 1..MASK come from the forloop.
C Entry 0 means "at least MAXSHIFT trailing zeros"; mpn_gcd_22 in this file
C never uses that value as a shift count, it branches to a full-limb bit
C count whenever the masked index is zero.
C NOTE(review): the second DEF_OBJECT argument (64) is presumably the
C alignment of the object in bytes -- confirm in the macro definition.
65
66deflit(MAXSHIFT, 8)
67deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
68
69DEF_OBJECT(ctz_table,64)
70	.byte	MAXSHIFT
71forloop(i,1,MASK,
72`	.byte	m4_count_trailing_zeros(i)
73')
74END_OBJECT(ctz_table)
75
C Register roles used by mpn_gcd_22.
C
C Operands as seen after FUNC_ENTRY(4): u1, u0 and v1 are used in place; the
C fourth operand arrives in %rcx (v0_param) and is copied to %rax (v0) by
C the first instruction of the function.
76define(`u1',    `%rdi')
77define(`u0',    `%rsi')
78define(`v1',    `%rdx')
79define(`v0_param', `%rcx')
80
C v0 lives in %rax for the whole loop.  cnt (the shift count) shares %rcx
C with v0_param and with t0 below; the count has to be in %cl for the
C variable shifts, and t0 is dead by the time cnt is loaded.
81define(`v0',    `%rax')
82define(`cnt',   `%rcx')
83
C (s1,s0) holds a copy of u taken before the subtraction, (t1,t0) holds
C the difference v - u.
84define(`s0',    `%r8')
85define(`s1',    `%r9')
86define(`t0',    `%rcx')
87define(`t1',    `%r11')
88
89dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
90ABI_SUPPORT(STD64)
91
92ASM_START()
93	TEXT
94	ALIGN(64)
C mpn_gcd_22 (u1, u0, v1, v0)
C
C Computes the gcd of the two-limb values {u1,u0} and {v1,v0} by repeating
C the step  u = |u - v| >> (trailing zeros of |u - v|),  v = min(u,v).
C
C In (after FUNC_ENTRY):  %rdi = u1, %rsi = u0, %rdx = v1, %rcx = v0.
C Out:  when u and v become equal with a nonzero high limb, returns v0 in
C       %rax and v1 in %rdx (see L(end)).  When both high limbs become zero,
C       the result is produced by transferring to mpn_gcd_11.
C Scratch:  %rcx, %r8, %r9, %r10, %r11 and the flags; the argument registers
C           are updated in place.
C
C NOTE(review): the loop never removes trailing zeros from v and shifts u
C by the trailing-zero count of the difference, so it appears to require
C both inputs to be odd -- confirm against the C prototype and callers.
95PROLOGUE(mpn_gcd_22)
96	FUNC_ENTRY(4)
97	mov	v0_param, v0	C v0 moves to %rax, freeing %rcx for t0 and cnt
98
99	LEA(	ctz_table, %r10)	C %r10 = base address of ctz_table
100
101	ALIGN(16)
C Main loop.  Each pass forms both v - u (in t1,t0) and u - v (in u1,u0);
C the borrow out of u - v then selects the non-negative one.
102L(top):	mov	v0, t0
103	sub	u0, t0		C t0 = v0 - u0
104	jz	L(lowz)		C low limbs are equal: handled separately in L(lowz)
105	mov	v1, t1
106	sbb	u1, t1		C (t1,t0) = v - u; jz and mov left the borrow intact
107
108	mov	u0, s0		C (s1,s0) = copy of u, becomes the new v if u < v
109	mov	u1, s1
110
111	sub	v0, u0
112	sbb	v1, u1		C (u1,u0) = u - v, carry set iff u < v
113
C L(bck) is entered from above and from L(lowz) with: carry set iff the
C subtraction left in (u1,u0) borrowed, (t1,t0) = the opposite difference,
C (s1,s0) = u before the subtraction.
114L(bck):	cmovc	t0, u0		C u = |u - v|
115	cmovc	t1, u1		C u = |u - v|
116	cmovc	s0, v0		C v = min(u,v)
117	cmovc	s1, v1		C v = min(u,v)
118
C Count the trailing zeros of the new u0.  t0 was not changed by the cmovs
C and equals either the new u0 or its negation; x and -x have the same
C number of trailing zeros, so t0 can be used for the table lookup.
119	and	$MASK, R32(t0)	C index = low MAXSHIFT bits, ZF set if they are all zero
120	movzbl	(%r10,t0), R32(cnt)	C cnt = ctz_table[index]; t0 and cnt are both %rcx, flags unchanged
121	jz	L(count_better)	C ZF from the and: table cannot give the count
122C Right shift the two-limb value (u1,u0) by cnt bits, in place.
C NOTE(review): for cnt = 0 the shl below is a no-op and all of the old u1 is
C OR-ed into u0.  That is harmless only when u1 = 0, which holds on the
C L(lowz) path; on the main path cnt >= 1 presumably follows from both
C operands being odd -- confirm.
123L(shr):	shr	R8(cnt), u0	C u0 >>= cnt
124	mov	u1, t1
125	shr	R8(cnt), u1	C u1 >>= cnt
126	neg	cnt		C 64-bit shifts use the count mod 64, so -cnt acts as 64-cnt
127	shl	R8(cnt), t1	C t1 = the low cnt bits of the old u1, moved to the top
128	or	t1, u0		C ... and merged into the top of u0
129
130	test	v1, v1
131	jnz	L(top)		C loop while v has a nonzero high limb
132	test	u1, u1
133	jnz	L(top)		C ... or u has one
134
C Both high limbs are zero: hand the low limbs to the single-limb routine.
C TCALL is presumably a tail call (nothing falls through after it), so
C mpn_gcd_11 returns directly to our caller.
C NOTE(review): the two-limb return value needs a zero high limb in %rdx;
C v1 = %rdx is zero here, but this relies on mpn_gcd_11 returning with
C %rdx = 0 -- confirm in mpn_gcd_11.
135L(gcd_11):
136	mov	v0, %rdi	C first argument = v0; %rdi held u1, which is zero here
137C	mov	u0, %rsi		not needed, u0 already lives in %rsi
138	TCALL(	mpn_gcd_11)
139
C Reached when the low MAXSHIFT bits of the difference are all zero.  u0 is
C nonzero here, because both paths into L(bck) branch away when the limb
C they subtract is zero.
140L(count_better):
141	rep;bsf	u0, cnt		C tzcnt! (rep-prefixed bsf is the tzcnt encoding)
142	jmp	L(shr)
143
144L(lowz):C We come here when v0 - u0 = 0
145	C 1. If v1 - u1 = 0, then gcd is u = v.
146	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
147	mov	v1, t0
148	sub	u1, t0		C t0 = v1 - u1
149	je	L(end)		C u = v: the result is already in (v1,v0)
150
C Set up the state L(bck) expects, with the new u being the single limb
C |u1 - v1| (the low limb of u - v is zero on this path).
151	xor	t1, t1		C (t1,t0) = (0, v1 - u1)
152	mov	u0, s0		C (s1,s0) = copy of u
153	mov	u1, s1
154	mov	u1, u0
155	xor	u1, u1		C new high limb of u is zero
156	sub	v1, u0		C u0 = old u1 - v1; kept last so its carry reaches L(bck)
157	jmp	L(bck)
158
C The return value is already in place: v0 is %rax and v1 is %rdx.
159L(end):	C mov	v0, %rax
160	C mov	v1, %rdx
161	FUNC_EXIT()
162	ret
163EPILOGUE()
164