dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 ?	  no, use shl/shr
C AMD K10	 1.8-2.0	 1.8-2.0	 ?	  yes
C AMD bd1	 1.9		 1.9		 ?	  yes
C AMD bobcat	 3.67		 3.67		 ?	  yes, bad for n < 20
C Intel P4	 4.75		 4.75		 ?	  no, slow movdqu
C Intel core2	 2.27		 2.27		 ?	  no, use shld/shrd
C Intel NHM	 2.15		 2.15		 ?	  no, use shld/shrd
C Intel SBR	 1.45		 1.45		 ?	  yes, bad for n = 4-6
C Intel atom	12.9		12.9		 ?	  no
C VIA nano	 6.18		 6.44		 ?	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.

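C The operation performed here is mpn_lshiftc: shift {ap,n} left by cnt bits
C (1 <= cnt < 64), store the one's complement of the result at {rp,n}, and
C return the bits shifted out of the top limb.  As a rough reference sketch
C only (plain C, not part of the build; limbs taken to be 64 bits as on
C AMD64, and ref_lshiftc is just an illustrative name):
C
C	mp_limb_t
C	ref_lshiftc (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C	{
C	  mp_limb_t retval = ap[n - 1] >> (64 - cnt);
C	  for (mp_size_t i = n - 1; i > 0; i--)
C	    rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (64 - cnt)));
C	  rp[0] = ~(ap[0] << cnt);
C	  return retval;
C	}
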
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax	C ap[n-1]
	shr	R8(%rcx), %rax		C retval: bits shifted out of the top limb

	pcmpeqb	%xmm3, %xmm3		C set to 111...111

	cmp	$3, n
	jle	L(bc)

	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

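C Here rp+8n is 16-byte aligned, so the movdqa stores below are all aligned.
C Choose the loop entry point from (n + 1) & 6 and bias n accordingly, so
C that one 8-way unrolled loop plus a 1- or 2-limb wind-down covers every
C residue of n mod 8.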
L(rp_aligned):
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

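C Main loop: each iteration produces 8 limbs of result with unaligned 16-byte
C loads (movdqu) from ap and aligned 16-byte stores (movdqa) to rp.  Every
C 16-byte chunk is formed from two loads offset by one limb, combined with
C psllq/psrlq/por, and complemented by pxor with the all-ones %xmm3.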
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

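C Wind down: one or two limbs remain.  An odd n here means a single bottom
C limb, handled at L(end8); an even n means two bottom limbs, finished with
C one aligned 16-byte store in which the bottom limb gets no in-shifted bits.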
L(end):	test	$1, R8(n)
	jnz	L(end8)

	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
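C Reached for n <= 3; everything is done with 8-byte movq loads and stores.
C L(end8) also serves as the common tail that produces the bottom limb.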
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()