dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.

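C For reference, the function's contract, as a rough C sketch (assuming 64-bit
C limbs; lshift_ref is an illustrative name, not part of the GMP API):
C
C	mp_limb_t
C	lshift_ref (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C	{				/* 1 <= cnt <= 63, n >= 1 */
C	  mp_limb_t retval = ap[n - 1] >> (64 - cnt);
C	  for (mp_size_t i = n - 1; i > 0; i--)
C	    rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C	  rp[0] = ap[0] << cnt;
C	  return retval;		/* bits shifted out at the top */
C	}
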
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
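C Put cnt in xmm4 and 64-cnt in xmm5 for the psllq/psrlq pairs below, and
C compute the return value, ap[n-1] >> (64-cnt), in rax.  The shr uses
C cl = -cnt, of which only the low 6 bits matter, giving a count of 64-cnt.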
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax

	cmp	$3, n
	jle	L(bc)

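C If the top end of rp (rp+8n) is only 8-byte aligned, peel off the top limb
C so that the movdqa stores in the unrolled loop land on 16-byte boundaries.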
	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

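C Dispatch on n mod 8 (via (n+1) & 6) to the matching entry point of the
C 8-way unrolled loop; the last 1 or 2 low limbs are finished after the loop.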
L(rp_aligned):
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

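C Main loop: shift 8 limbs per iteration, working from high addresses towards
C low, with unaligned 16-byte loads (movdqu) and aligned 16-byte stores (movdqa).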
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

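C Wind down.  If n is now odd, a single low limb remains and is handled at
C L(end8); otherwise two low limbs remain and are handled here.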
L(end):	test	$1, R8(n)
	jnz	L(end8)

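C punpcklqdq puts ap[0] in the high qword of xmm0 (low qword zero), so the
C combined store gives rp[0] = ap[0] << cnt and
C rp[1] = (ap[1] << cnt) | (ap[0] >> (64-cnt)).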
	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
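C Handle n <= 3 one limb at a time, from the most significant limb down.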
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)

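C Store the least significant result limb: rp[0] = ap[0] << cnt, with no
C lower limb to shift bits in from.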
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()