dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.
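C
C As a rough reference only (a sketch, not code taken from GMP; lshift_ref is
C a hypothetical name), the operation performed here is, assuming 64-bit limbs
C and 1 <= cnt < 64:
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	uint64_t
C	lshift_ref (uint64_t *rp, const uint64_t *ap, ptrdiff_t n, unsigned cnt)
C	{
C	  uint64_t retval = ap[n - 1] >> (64 - cnt);	/* bits shifted out */
C	  for (ptrdiff_t i = n - 1; i > 0; i--)
C	    rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C	  rp[0] = ap[0] << cnt;
C	  return retval;
C	}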

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
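C Set up the shift counts: xmm4 = cnt drives the psllq's, xmm5 = 64-cnt drives
C the psrlq's that fetch the bits coming in from the next lower limb.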
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax
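C %rax now holds the return value, ap[n-1] >> (64-cnt), i.e. the bits shifted
C out of the top limb (shr masks the negated count to its low 6 bits).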

	cmp	$3, n
	jle	L(bc)

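C The loop below works from the top of the operands downwards and stores with
C 16-byte movdqa, so check whether rp + 8n is 16-byte aligned; if it is not,
C peel off one limb at the top first.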
	lea	(rp,n,8), R32(%rcx)
	bt	$3, R32(%rcx)
	jnc	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
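C Dispatch on n mod 8 (pairs of residues collapse, via (n+1) & 6) so that the
C unrolled loop, which handles 8 limbs per iteration in two-limb chunks, is
C entered at the matching point; the bottom one or two limbs are finished
C after the loop.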
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

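C Main loop: each iteration shifts eight limbs, using unaligned 16-byte loads
C (movdqu) and aligned 16-byte stores (movdqa).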
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

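C Wind down: the parity of n now tells how many limbs remain at the bottom,
C one if n is odd, two if n is even.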
L(end):	bt	$0, R32(n)
	jc	L(end8)

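C Two limbs remain.  punpcklqdq forms {0, ap[0]} so that a zero limb supplies
C the bits shifted into rp[0], then a single aligned store finishes the job.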
	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
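C Handles n <= 3 using plain 64-bit movq operations; L(end8) below also serves
C as the common exit that writes the lowest limb.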
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)

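C Lowest limb: rp[0] = ap[0] << cnt, with no bits coming in from below.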
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()