xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/rshift.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt
2
3dnl  Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                   cycles/limb
34C POWER3/PPC630          ?
35C POWER4/PPC970          ?
36C POWER5                 2.25
37C POWER6                 9.75
38C POWER7                 2.15
39
40C TODO
41C  * Try to reduce the number of needed live registers
42C  * Micro-optimise header code
43C  * Keep in synch with lshift.asm and lshiftc.asm
44
45C INPUT PARAMETERS
C  rp   r3  base address the result limbs are stored to
C  up   r4  base address the source limbs are loaded from
C  n    r5  limb count; read only during setup, because retval reuses r5
C  cnt  r6  shift distance in bits
46define(`rp',  `r3')
47define(`up',  `r4')
48define(`n',   `r5')
49define(`cnt', `r6')
50
C WORKING REGISTERS
C  tnc     r0        complementary shift distance, set to 64 - cnt on entry
C  u0, u1  r30, r31  hold source limbs; saved on entry, restored before return
C  retval  r5        return value, the first source limb shifted left by tnc
51define(`tnc',`r0')
52define(`u0',`r30')
53define(`u1',`r31')
54define(`retval',`r5')
55
56ASM_START()
C mpn_rshift -- shift the n-limb operand at up right by cnt bits, store at rp.
C
C Every result limb is formed as (up[i] >> cnt) | (up[i+1] << tnc) with
C tnc = 64 - cnt, except the topmost one, which is just up[n-1] >> cnt.
C Limbs are handled from the lowest address upwards.  The function returns
C up[0] << tnc, that is, the bits shifted out at the low end, left-justified.
C
C Layout of the code:
C  * Setup computes n mod 4 and the loop count (n+3)/4, then dispatches to one
C    of the feed-in blocks L(b00), L(b01), L(b10), L(b11).
C  * Each feed-in block primes the working registers, biases rp so that the
C    fixed store offsets used further down hit the right limbs, and either
C    enters the 4-limb loop at a matching entry point or, when the count in
C    ctr is exhausted, goes straight to the wind-down code.
C  * L(top) is the main loop; loads, shifts and stores belonging to different
C    limbs are interleaved in it.
C  * L(end) through L(cj2) is the wind-down code storing the final limbs.
C
C Working registers: srd results go to r8 and r12, sld results to r9 and r7;
C r11 = r8 | r9 and r10 = r12 | r7 are the finished limbs waiting to be stored.
C
C NOTE(review): presumably n >= 1 and 1 <= cnt <= 63 are required, as nothing
C here checks either -- confirm against the documented mpn_rshift contract.
57PROLOGUE(mpn_rshift)
58	std	r31, -8(r1)	C save r31 (u1) below the stack pointer
59	std	r30, -16(r1)	C save r30 (u0); r1 itself is left unchanged
60	subfic	tnc, cnt, 64	C tnc = 64 - cnt
61C	sldi	r30, n, 3	C byte count corresponding to n
62C	add	rp, rp, r30	C rp = rp + n
63C	add	up, up, r30	C up = up + n
64	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
65	cmpdi	cr6, r30, 2	C cr6 = (n & 3) compared with 2
66	addi	r31, n, 3	C compute count...
67	ld	r10, 0(up)	C load 1st limb for b00...b11
68	sld	retval, r10, tnc	C return value; overwrites n, which is not read again
C ctr = (n + 3) >> 2.  The mode32 variant rotates right by 2 and clears the
C upper 34 bits, so only the low 32 bits of n + 3 contribute to the count.
69ifdef(`HAVE_ABI_mode32',
70`	rldicl	r31, r31, 62,34',	C ...branch count
71`	srdi	r31, r31, 2')	C ...for ctr
72	mtctr	r31		C copy count into ctr
73	beq	cr0, L(b00)	C n mod 4 = 0
74	blt	cr6, L(b01)	C n mod 4 = 1
75	ld	r11, 8(up)	C load 2nd limb for b10 and b11
76	beq	cr6, L(b10)	C n mod 4 = 2, otherwise fall into b11
77
C Feed-in for n mod 4 = 3.  On entry r10 = up[0] and r11 = up[1].
78	ALIGN(16)
79L(b11):	srd	r8, r10, cnt
80	sld	r9, r11, tnc
81	ld	u1, 16(up)
82	addi	up, up, 24
83	srd	r12, r11, cnt
84	sld	r7, u1, tnc
85	addi	rp, rp, -16	C bias rp: the first result limb is stored via 16(rp)
86	bdnz	L(gt3)		C more than 3 limbs
87
C Exactly 3 limbs: finish the pending values and store via the wind-down code.
88	or	r11, r8, r9
89	srd	r8, u1, cnt	C topmost result limb
90	b	L(cj3)
91
92	ALIGN(16)
93L(gt3):	ld	u0, 0(up)
94	or	r11, r8, r9
95	srd	r8, u1, cnt
96	sld	r9, u0, tnc
97	ld	u1, 8(up)
98	or	r10, r12, r7
99	b	L(L11)		C enter the loop at the point that stores to 16(rp)
100
C Feed-in for n mod 4 = 2.  On entry r10 = up[0] and r11 = up[1].
101	ALIGN(32)
102L(b10):	srd	r12, r10, cnt
103	addi	rp, rp, -24	C bias rp: the first result limb is stored via 24(rp)
104	sld	r7, r11, tnc
105	bdnz	L(gt2)		C more than 2 limbs
106
C Exactly 2 limbs.
107	srd	r8, r11, cnt	C topmost result limb
108	or	r10, r12, r7
109	b	L(cj2)
110
111L(gt2):	ld	u0, 16(up)
112	srd	r8, r11, cnt
113	sld	r9, u0, tnc
114	ld	u1, 24(up)
115	or	r10, r12, r7
116	srd	r12, u0, cnt
117	sld	r7, u1, tnc
118	ld	u0, 32(up)
119	or	r11, r8, r9
120	addi	up, up, 16
121	b	L(L10)		C enter the loop at the point that stores to 24(rp)
122
C Feed-in for n mod 4 = 0.  On entry r10 = up[0].
123	ALIGN(16)
124L(b00):	ld	u1, 8(up)
125	srd	r12, r10, cnt
126	sld	r7, u1, tnc
127	ld	u0, 16(up)
128	srd	r8, u1, cnt
129	sld	r9, u0, tnc
130	ld	u1, 24(up)
131	or	r10, r12, r7
132	srd	r12, u0, cnt
133	sld	r7, u1, tnc
134	addi	rp, rp, -8	C bias rp: the first result limb is stored via 8(rp)
135	bdz	L(cj4)		C ctr exhausted: exactly 4 limbs
136
137L(gt4):	addi	up, up, 32
138	ld	u0, 0(up)
139	or	r11, r8, r9
140	b	L(L00)		C enter the loop at the point that stores to 8(rp)
141
C Feed-in for n mod 4 = 1.  On entry r10 = up[0]; rp needs no bias here.
142	ALIGN(16)
143L(b01):	bdnz	L(gt1)		C more than 1 limb
144	srd	r8, r10, cnt	C single limb: the result is just up[0] >> cnt
145	std	r8, 0(rp)
146	b	L(ret)
147
148L(gt1):	ld	u0, 8(up)
149	srd	r8, r10, cnt
150	sld	r9, u0, tnc
151	ld	u1, 16(up)
152	srd	r12, u0, cnt
153	sld	r7, u1, tnc
154	ld	u0, 24(up)
155	or	r11, r8, r9
156	srd	r8, u1, cnt
157	sld	r9, u0, tnc
158	ld	u1, 32(up)
159	addi	up, up, 40
160	or	r10, r12, r7
161	bdz	L(end)		C ctr exhausted: skip the loop, else fall into it
162
C Main loop: four result limbs are stored per iteration, at 0, 8, 16 and
C 24(rp), and both up and rp advance by 32 bytes.  L(L00), L(L11) and L(L10)
C are the mid-loop entry points used by the feed-in blocks above.
163	ALIGN(32)
164L(top):	srd	r12, u0, cnt
165	sld	r7, u1, tnc
166	ld	u0, 0(up)
167	std	r11, 0(rp)
168	or	r11, r8, r9
169L(L00):	srd	r8, u1, cnt
170	sld	r9, u0, tnc
171	ld	u1, 8(up)
172	std	r10, 8(rp)
173	or	r10, r12, r7
174L(L11):	srd	r12, u0, cnt
175	sld	r7, u1, tnc
176	ld	u0, 16(up)
177	std	r11, 16(rp)
178	or	r11, r8, r9
179L(L10):	srd	r8, u1, cnt
180	sld	r9, u0, tnc
181	ld	u1, 24(up)
182	addi	up, up, 32
183	std	r10, 24(rp)
184	addi	rp, rp, 32
185	or	r10, r12, r7
186	bdnz	L(top)
187
C Wind-down: no further loads.  Entering at L(end), L(cj4), L(cj3) or L(cj2)
C stores the last 5, 4, 3 or 2 result limbs respectively; the final store
C writes r8, the topmost limb, which has no left-shifted part merged in.
188	ALIGN(32)
189L(end):	srd	r12, u0, cnt
190	sld	r7, u1, tnc
191	std	r11, 0(rp)
192L(cj4):	or	r11, r8, r9
193	srd	r8, u1, cnt	C topmost result limb
194	std	r10, 8(rp)
195L(cj3):	or	r10, r12, r7
196	std	r11, 16(rp)
197L(cj2):	std	r10, 24(rp)
198	std	r8, 32(rp)
199
C Common exit: reload the two registers saved on entry.
200L(ret):	ld	r31, -8(r1)
201	ld	r30, -16(r1)
C Move the return value into place.  In the mode32 variant r3 receives
C retval >> 32 and r4 a copy of retval; otherwise r3 receives retval.
202ifdef(`HAVE_ABI_mode32',
203`	srdi	r3, retval, 32
204	mr	r4, retval
205',`	mr	r3, retval')
206	blr
207EPILOGUE()
208