xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/lshiftc.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
2
3dnl  Copyright 2003, 2005, 2010 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                   cycles/limb
34C POWER3/PPC630          ?
35C POWER4/PPC970          ?
36C POWER5                 2.25
37C POWER6                 9.5
38C POWER7                 2.15
39
40C TODO
41C  * Try to reduce the number of needed live registers
42C  * Micro-optimise header code
43C  * Keep in synch with lshift.asm and rshift.asm
44C  * Could the long-scheduled std insns be less scheduled?
45
46C INPUT PARAMETERS
C  rp  = r3  destination pointer; every result limb is stored through it
C  up  = r4  source pointer; every source limb is loaded through it
C  n   = r5  number of 64-bit limbs (scaled by 8 to get a byte count)
C  cnt = r6  left shift count in bits
47define(`rp',  `r3')
48define(`up',  `r4')
49define(`n',   `r5')
50define(`cnt', `r6')
51
C WORKING REGISTERS
C  tnc    = r0        complementary shift count, 64 - cnt
C  u0, u1 = r30, r31  source limbs loaded by the main loop and its feed-in
C                     code; the routine saves both registers on entry and
C                     reloads them before returning
C  retval = r5        return value; shares r5 with n, so n is dead once
C                     retval has been written
52define(`tnc',`r0')
53define(`u0',`r30')
54define(`u1',`r31')
55define(`retval',`r5')
56
57ASM_START()
C mpn_lshiftc: shift the n-limb operand at up left by cnt bits, complement
C every result limb, and store the n result limbs at rp.  The bits shifted
C out of the most significant limb are returned, not complemented.
C
C Each result limb is formed as nor(hi << cnt, lo >> tnc) with tnc = 64 - cnt.
C The least significant result limb is nor(x, x) of the shifted lowest source
C limb, so the vacated low bits come out as ones.
C
C Both pointers are first advanced to the end of their operands; limbs are
C then processed from most to least significant.  The main loop produces four
C limbs per pass and is software pipelined.  n mod 4 selects one of four
C feed-in sequences (b00, b01, b10, b11) which either jump into the middle of
C the loop (L00, L11, L10) or go straight to the wind-down code (cj4, cj3,
C cj2) when the operand is too short to need the loop.
C
C The first limb is loaded unconditionally, so n >= 1 is assumed.
C NOTE(review): cnt is presumably restricted to 1..63 -- confirm against the
C documented mpn shift interface.
58PROLOGUE(mpn_lshiftc)
59	std	r31, -8(r1)	C save r31 (u1) below the stack pointer; r1 is never moved
60	std	r30, -16(r1)	C save r30 (u0) likewise; both are reloaded at L(ret)
61	subfic	tnc, cnt, 64	C tnc = 64 - cnt
62	sldi	r7, n, 3	C byte count corresponding to n
63	add	up, up, r7	C up = up + 8n, just past the top source limb
64	add	rp, rp, r7	C rp = rp + 8n, just past the top result limb
65	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
66	cmpdi	cr6, r30, 2	C cr6 = (n & 3) compared with 2
67	addi	r31, n, 3	C compute count...
68	ld	r10, -8(up)	C load 1st limb for b00...b11
69	srd	retval, r10, tnc	C retval = bits shifted out of the top limb (overwrites n)
70	srdi	r31, r31, 2	C ...for ctr
71	mtctr	r31		C copy count into ctr, ctr = (n + 3) >> 2
72	beq	cr0, L(b00)	C n mod 4 = 0
73	blt	cr6, L(b01)	C n mod 4 = 1
74	ld	r11, -16(up)	C load 2nd limb for b10 and b11
75	beq	cr6, L(b10)	C n mod 4 = 2, otherwise fall into b11
76
C n mod 4 = 3.  r10 and r11 hold the two most significant source limbs.
77	ALIGN(16)
78L(b11):	sld	r8, r10, cnt
79	srd	r9, r11, tnc
80	ld	u1, -24(up)	C 3rd source limb from the top
81	addi	up, up, -24
82	sld	r12, r11, cnt
83	srd	r7, u1, tnc
84	addi	rp, rp, 16	C bias rp so the L11/cj3 store offsets hit the top limbs
85	bdnz	L(gt3)		C ctr still nonzero: more than 3 limbs
86
C n = 3: finish the three limbs in the shared wind-down code.
87	nor	r11, r8, r9	C top result limb
88	sld	r8, u1, cnt
89	nor	r8, r8, r8	C lowest result limb, complement of the shifted source limb
90	b	L(cj3)
91
C n mod 4 = 3, n > 3: complete the feed-in and enter the loop at L11.
92	ALIGN(16)
93L(gt3):	ld	u0, -8(up)
94	nor	r11, r8, r9
95	sld	r8, u1, cnt
96	srd	r9, u0, tnc
97	ld	u1, -16(up)
98	nor	r10, r12, r7
99	b	L(L11)
100
C n mod 4 = 2.  r10 and r11 hold the two most significant source limbs.
101	ALIGN(32)
102L(b10):	sld	r12, r10, cnt
103	addi	rp, rp, 24	C bias rp so the L10/cj2 store offsets hit the top limbs
104	srd	r7, r11, tnc
105	bdnz	L(gt2)		C ctr still nonzero: more than 2 limbs
106
C n = 2: both result limbs are stored at cj2.
107	sld	r8, r11, cnt
108	nor	r10, r12, r7	C top result limb
109	nor	r8, r8, r8	C lowest result limb
110	b	L(cj2)
111
C n mod 4 = 2, n > 2: complete the feed-in and enter the loop at L10.
112L(gt2):	ld	u0, -24(up)
113	sld	r8, r11, cnt
114	srd	r9, u0, tnc
115	ld	u1, -32(up)
116	nor	r10, r12, r7
117	sld	r12, u0, cnt
118	srd	r7, u1, tnc
119	ld	u0, -40(up)
120	nor	r11, r8, r9
121	addi	up, up, -16
122	b	L(L10)
123
C n mod 4 = 0.  Only r10 (the top source limb) has been loaded so far.
124	ALIGN(16)
125L(b00):	ld	u1, -16(up)
126	sld	r12, r10, cnt
127	srd	r7, u1, tnc
128	ld	u0, -24(up)
129	sld	r8, u1, cnt
130	srd	r9, u0, tnc
131	ld	u1, -32(up)
132	nor	r10, r12, r7
133	sld	r12, u0, cnt
134	srd	r7, u1, tnc
135	addi	rp, rp, 8	C bias rp so the L00/cj4 store offsets hit the top limbs
136	bdz	L(cj4)		C ctr reached zero: n = 4, go straight to wind-down
137
C n mod 4 = 0, n > 4: enter the loop at L00.
138L(gt4):	addi	up, up, -32
139	ld	u0, -8(up)
140	nor	r11, r8, r9
141	b	L(L00)
142
C n mod 4 = 1.  r10 holds the top source limb; rp needs no bias here.
143	ALIGN(16)
144L(b01):	bdnz	L(gt1)		C ctr still nonzero: more than 1 limb
145	sld	r8, r10, cnt	C n = 1: the single result limb is ~(limb << cnt)
146	nor	r8, r8, r8
147	std	r8, -8(rp)
148	b	L(ret)
149
C n mod 4 = 1, n > 1: feed-in, then fall through into the top of the loop,
C or skip the loop entirely when ctr reaches zero (n = 5).
150L(gt1):	ld	u0, -16(up)
151	sld	r8, r10, cnt
152	srd	r9, u0, tnc
153	ld	u1, -24(up)
154	sld	r12, u0, cnt
155	srd	r7, u1, tnc
156	ld	u0, -32(up)
157	nor	r11, r8, r9
158	sld	r8, u1, cnt
159	srd	r9, u0, tnc
160	ld	u1, -40(up)
161	addi	up, up, -40
162	nor	r10, r12, r7
163	bdz	L(end)
164
C Main loop, four result limbs per pass.  r8/r9 and r12/r7 alternate as the
C (hi << cnt, lo >> tnc) pair, r11 and r10 alternate as the finished result
C limb waiting for its store, and u0/u1 alternate as the newest source limbs.
C up and rp each step down by 32 bytes per pass.  L00, L11 and L10 are the
C entry points used by the feed-in code above.
165	ALIGN(32)
166L(top):	sld	r12, u0, cnt
167	srd	r7, u1, tnc
168	ld	u0, -8(up)
169	std	r11, -8(rp)
170	nor	r11, r8, r9
171L(L00):	sld	r8, u1, cnt
172	srd	r9, u0, tnc
173	ld	u1, -16(up)
174	std	r10, -16(rp)
175	nor	r10, r12, r7
176L(L11):	sld	r12, u0, cnt
177	srd	r7, u1, tnc
178	ld	u0, -24(up)
179	std	r11, -24(rp)
180	nor	r11, r8, r9
181L(L10):	sld	r8, u1, cnt
182	srd	r9, u0, tnc
183	ld	u1, -32(up)
184	addi	up, up, -32
185	std	r10, -32(rp)
186	addi	rp, rp, -32
187	nor	r10, r12, r7
188	bdnz	L(top)
189
C Wind-down: no further loads; combine and store the limbs still in flight.
C cj4, cj3 and cj2 are also entered directly by the feed-in code when n is
C 4, 3 and 2 respectively.
190	ALIGN(32)
191L(end):	sld	r12, u0, cnt
192	srd	r7, u1, tnc
193	std	r11, -8(rp)
194L(cj4):	nor	r11, r8, r9
195	sld	r8, u1, cnt
196	std	r10, -16(rp)
197	nor	r8, r8, r8	C lowest result limb, complement of the shifted source limb
198L(cj3):	nor	r10, r12, r7
199	std	r11, -24(rp)
200L(cj2):	std	r10, -32(rp)
201	std	r8, -40(rp)	C store the least significant result limb
202
C Common exit: reload the two saved registers and deliver the return value.
C With HAVE_ABI_mode32 defined, r3 receives retval >> 32 and r4 receives
C retval; otherwise retval is copied to r3.
203L(ret):	ld	r31, -8(r1)
204	ld	r30, -16(r1)
205ifdef(`HAVE_ABI_mode32',
206`	srdi	r3, retval, 32
207	mr	r4, retval
208',`	mr	r3, retval')
209	blr
210EPILOGUE()
211