xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/fastsse/lshiftc.asm (revision 7bdf38e5b7a28439665f2fdeff81e36913eef7dd)
dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.

dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.

dnl  Copyright 2010-2012, 2018 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb	     cycles/limb	      good
C          16-byte aligned         16-byte unaligned	    for cpu?
C AMD K8,K9	 ?			 ?
C AMD K10	 1.85  (1.635)		 1.9   (1.67)		Y
C AMD bd1	 1.82  (1.75)		 1.82  (1.75)		Y
C AMD bobcat	 4.5			 4.5
C Intel P4	 3.6   (3.125)		 3.6   (3.125)		Y
C Intel core2	 2.05  (1.67)		 2.55  (1.75)
C Intel NHM	 2.05  (1.875)		 2.6   (2.25)
C Intel SBR	 1.55  (1.44)		 2     (1.57)		Y
C Intel atom	 ?			 ?
C VIA nano	 2.5   (2.5)		 2.5   (2.5)		Y

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We always write using
C 16-byte operations, we read with both 8-byte and 16-byte operations.

C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.

C This is not yet great code:
C   (1) The unaligned case makes too many reads.
C   (2) We should do some unrolling, at least 2-way.
C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
C Nano.

C INPUT PARAMETERS (SysV AMD64 argument registers)
define(`rp',  `%rdi')	C mp_ptr:    destination limb vector
define(`ap',  `%rsi')	C mp_srcptr: source limb vector
define(`n',   `%rdx')	C mp_size_t: limb count
define(`cnt', `%rcx')	C unsigned:  shift count
68
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt, left shift count
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt, complementary right count

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax	C return value: bits shifted out the top,
	shr	R8(%rcx), %rax		C i.e. ap[n-1] >> (64-cnt)

	pcmpeqb	%xmm2, %xmm2		C set to 111...111

	cmp	$2, n
	jle	L(le2)

	lea	(rp,n,8), R32(%rcx)	C only the low alignment bits are tested,
	test	$8, R8(%rcx)		C so a 32-bit lea suffices
	je	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0		C complement — the "c" in lshiftc
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
	lea	(ap,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	je	L(aent)
	jmp	L(uent)
C *****************************************************************************

C Handle the case when ap != rp (mod 16).

	ALIGN(16)
L(utop):movq	(ap,n,8), %xmm1
	punpcklqdq  8(ap,n,8), %xmm1
	movdqa	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	movdqa	%xmm0, (rp,n,8)
L(uent):sub	$2, n
	ja	L(utop)

	jne	L(end8)			C odd residual limb left

C Wind-down: final aligned 16-byte store; xmm0's low lane is zeroed so the
C bottom limb shifts in zero bits.
	movq	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	punpcklqdq  8(ap), %xmm1
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).

	ALIGN(16)
L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n-2] + ap[n-3]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	movdqa	%xmm0, (rp,n,8)
L(aent):sub	$2, n
	ja	L(atop)

	jne	L(end8)			C odd residual limb left

C Wind-down: shift the bottom two limbs, zero shifting into the lowest.
	movdqa	(ap), %xmm0
	pxor	%xmm1, %xmm1
	punpcklqdq  %xmm0, %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C *****************************************************************************

C n <= 2: do limb-at-a-time 8-byte stores (rp may be only 8-byte aligned).

	ALIGN(16)
L(le2):	jne	L(end8)			C n = 1 goes straight to the last limb

	movq	8(ap), %xmm0
	movq	(ap), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0		C bottom limb: zeros shift in from below
	psllq	%xmm4, %xmm0
	pxor	%xmm2, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()
184