dnl  AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
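C A rough C-level sketch of the operation performed here (illustration only,
C not part of the build; ref_rshift is a made-up name, and 64-bit limbs with
C 1 <= cnt <= 63 are assumed, per the usual mpn conventions):
C
C	mp_limb_t
C	ref_rshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C	{
C	  mp_limb_t retval = ap[0] << (64 - cnt);	/* bits shifted out at low end */
C	  mp_size_t i;
C	  for (i = 0; i < n - 1; i++)
C	    rp[i] = (ap[i] >> cnt) | (ap[i+1] << (64 - cnt));
C	  rp[n-1] = ap[n-1] >> cnt;
C	  return retval;
C	}
C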
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

	neg	R32(%rcx)
	mov	(ap), %rax
	shl	R8(%rcx), %rax		C return value: ap[0] << (64-cnt)

	cmp	$3, n
	jle	L(bc)

	bt	$3, R32(rp)
	jnc	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	(ap), %xmm0
	movq	8(ap), %xmm1
	psrlq	%xmm4, %xmm0
	psllq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	lea	8(ap), ap
	lea	8(rp), rp
	dec	n

L(rp_aligned):
	lea	1(n), %r8d
	lea	(ap,n,8), ap
	lea	(rp,n,8), rp
	neg	n

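C Enter the 8-limbs-per-iteration loop below at the point matching n's
C residue, so that one or two limbs are left over for the wind-down code.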
	and	$6, R32(%r8)
	jz	L(bu0)
	cmp	$4, R32(%r8)
	jz	L(bu4)
	jc	L(bu2)
L(bu6):	add	$4, n
	jmp	L(i56)
L(bu0):	add	$6, n
	jmp	L(i70)
L(bu4):	add	$2, n
	jmp	L(i34)
L(bu2):	add	$8, n
	jge	L(end)

	ALIGN(16)
L(top):	movdqu	-64(ap,n,8), %xmm1
	movdqu	-56(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -64(rp,n,8)
L(i70):
	movdqu	-48(ap,n,8), %xmm1
	movdqu	-40(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -48(rp,n,8)
L(i56):
	movdqu	-32(ap,n,8), %xmm1
	movdqu	-24(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -32(rp,n,8)
L(i34):
	movdqu	-16(ap,n,8), %xmm1
	movdqu	-8(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp,n,8)
	add	$8, n
	jl	L(top)

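C Wind down: one or two limbs remain; bit 0 of n tells which.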
L(end):	bt	$0, R32(n)
	jc	L(e1)

	movdqu	-16(ap), %xmm1
	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp)
	FUNC_EXIT()
	ret

L(e1):	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, -8(rp)
	FUNC_EXIT()
	ret

C Basecase
	ALIGN(16)
L(bc):	dec	R32(n)
	jnz	1f
	movq	(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret

1:	movq	(ap), %xmm1
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	dec	R32(n)
	jnz	1f
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 8(rp)
	FUNC_EXIT()
	ret

1:	movq	8(ap), %xmm1
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()
