xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/fastsse/sec_tabselect.asm (revision cef8759bd76c1b621f8eab8faa6f208faabc2e15)
dnl  AMD64 SSE mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb
C	      ali,evn n	     unal,evn n	      other cases
C AMD K8,K9	 1.65		1.65		 1.8
C AMD K10	 0.78		0.78		 0.85
C AMD bd1	 0.80		0.91		 1.25
C AMD bobcat	 2.15		2.15		 2.37
C Intel P4	 2.5		2.5		 2.95
C Intel core2	 1.17		1.25		 1.25
C Intel NHM	 0.87		0.90		 0.90
C Intel SBR	 0.63		0.79		 0.77
C Intel atom	 4.3		 4.3		 4.3	slower than plain code
C VIA nano	 1.4		 5.1		 3.14	too alignment dependent

C NOTES
C  * We only honour the least significant 32 bits of the `which' and `nents'
C    arguments to allow efficient code using just SSE2.  We would need to
C    either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence.
C  * We use movd for copying between xmm and plain registers, since old gas
C    rejects movq.  But gas assembles movd as movq when given a 64-bit greg.

C Parameter registers (SysV order; DOS64 entry fixed up by FUNC_ENTRY/IFDOS).
define(`rp',     `%rdi')	C result area, n limbs
define(`tp',     `%rsi')	C table base: nents entries of n limbs each
define(`n',      `%rdx')	C limbs per table entry (also the entry stride)
define(`nents',  `%rcx')	C number of table entries
define(`which',  `%r8')		C index of the entry to copy out

define(`i',      `%r10')	C inner counter: table entries still to scan
define(`j',      `%r9')		C outer counter: limbs left, biased by -8

C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
C          nents  n   rp   tab       which j    i   temp  *    *    *    *

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)

C void mpn_sec_tabselect (mp_ptr rp, mp_srcptr tab, mp_size_t n,
C			  mp_size_t nents, mp_size_t which)
C
C Copy table entry `which' (n limbs) to rp.  Every entry of the table is
C read and masked, so the sequence of memory accesses is independent of
C `which' -- this is what makes the routine side-channel silent.  Entry k
C is AND-ed with the mask -(k == which) and OR-ed into the accumulators.

PROLOGUE(mpn_sec_tabselect)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')	C 5th argument is on the stack under DOS64

	movd	which, %xmm8
	pshufd	$0, %xmm8, %xmm8	C 4 `which' copies
	mov	$1, R32(%rax)
	movd	%rax, %xmm9
	pshufd	$0, %xmm9, %xmm9	C 4 copies of 1

C Outer loop: process the entries' limbs 8 at a time (one column pass).
	mov	n, j
	add	$-8, j			C j = n - 8; negative when < 8 limbs left
	js	L(outer_end)

L(outer_top):
	mov	nents, i
	mov	tp, %r11		C remember column base; tp walks the table below
	pxor	%xmm13, %xmm13		C entry index k, replicated in 4 x 32-bit lanes
	pxor	%xmm4, %xmm4		C clear the four 2-limb accumulators
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	ALIGN(16)
C Inner loop: visit this 8-limb column of every table entry.
L(top):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0		C xmm0 = all-ones iff k == which (low 32 bits)
	paddd	%xmm9, %xmm13		C k++ in every lane
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2		C keep data only for the selected entry
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	movdqu	32(tp), %xmm2
	movdqu	48(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm6
	por	%xmm3, %xmm7
	lea	(tp,n,8), tp		C same column of the next entry (n limbs on)
	add	$-1, i
	jne	L(top)

	movdqu	%xmm4, 0(rp)		C store the 8 selected limbs
	movdqu	%xmm5, 16(rp)
	movdqu	%xmm6, 32(rp)
	movdqu	%xmm7, 48(rp)

	lea	64(%r11), tp		C advance 8 limbs into the first entry
	lea	64(rp), rp
	add	$-8, j
	jns	L(outer_top)
L(outer_end):

C Tail: handle n mod 8 remaining limbs with 4-, 2- and 1-limb column passes,
C each a trimmed copy of the loop above.
	test	$4, R8(n)
	je	L(b0xx)
L(b1xx):mov	nents, i		C 4-limb pass
	mov	tp, %r11
	pxor	%xmm13, %xmm13
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	ALIGN(16)
L(tp4):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0
	paddd	%xmm9, %xmm13
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp4)
	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	lea	32(%r11), tp
	lea	32(rp), rp

L(b0xx):test	$2, R8(n)
	je	L(b00x)
L(b01x):mov	nents, i		C 2-limb pass
	mov	tp, %r11
	pxor	%xmm13, %xmm13
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp2):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0
	paddd	%xmm9, %xmm13
	movdqu	0(tp), %xmm2
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp2)
	movdqu	%xmm4, 0(rp)
	lea	16(%r11), tp
	lea	16(rp), rp

L(b00x):test	$1, R8(n)
	je	L(b000)
L(b001):mov	nents, i		C final 1-limb pass
	mov	tp, %r11
	pxor	%xmm13, %xmm13
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp1):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0
	paddd	%xmm9, %xmm13
	movq	0(tp), %xmm2		C single limb: 64-bit load/store
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp1)
	movq	%xmm4, 0(rp)

L(b000):FUNC_EXIT()
	ret
EPILOGUE()
