dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C                16-byte coaligned      unaligned
C                   cycles/limb        cycles/limb
C 7400,7410 (G4):       0.5                0.64
C 744x,745x (G4+):      0.75               0.82
C 970 (G5):             0.78               1.02		(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

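C For reference, the operation implemented here is a plain ascending limb
C copy.  A minimal C sketch of the semantics (assuming the mp_limb_t and
C mp_size_t types from gmp.h; the real entry point is the assembly below):
C
C	void
C	mpn_copyi (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}
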
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
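C A 16-byte vector register holds LIMBS_PER_VR limbs: 4 with 32-bit limbs,
C 2 with 64-bit limbs.  One unrolled loop iteration moves LIMBS_PER_2VR.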


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


ASM_START()
PROLOGUE(mpn_copyi)

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)
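C The cutoffs above are the smallest sizes the VMX path handles: with
C worst-case rp alignment the head loop peels up to LIMBS_PER_VR-1 limbs,
C and the main loop then needs a nonzero count of full 32-byte iterations.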

	or.	r0, n, n
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	mfspr	r12, 256
	oris	r0, r12, 0xf800		C Set VRSAVE bits 0-4
	mtspr	256, r0
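C VRSAVE (SPR 256) tells the system which vector registers are live across
C this code; the caller's mask, saved in r12, is restored at L(ret).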

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

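C rp is not 16-byte aligned: copy single limbs until it is, decreasing n.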
	subfic	r7, r7, LIMBS_PER_VR
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
LIMB32(`bne	L(top0)		')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy loop count to count register

	li	r10, 16

	beq	L(up_aligned)

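C up and rp are mutually misaligned: use lvsl/vperm, where us holds a
C permute mask derived from up's low address bits and each vperm splices
C two aligned loads into one 16-byte unaligned source chunk.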
	lvsl	us, 0, up

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	addi	up, up, 32
	addi	rp, rp, 16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, 16
	b	L(lpu)

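C Main unaligned loop: two aligned loads, two vperms, and two 16-byte
C stores per iteration, moving LIMBS_PER_2VR limbs.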
	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, 32
	vperm	v3, v0, v2, us
	stvx	v3, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpu)

	addi	up, up, -16
	b	L(tail)

L(up_aligned):

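C up and rp are both 16-byte aligned: plain lvx/stvx copying.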
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0,   up
	stvx	v0, 0,   rp
	addi	up, up, 16
	addi	rp, rp, 16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0,   up
	lvx	v1, r10, up
	addi	up, up, 32
	nop
	stvx	v0, 0,   rp
	stvx	v1, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpa)

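C Copy the n mod LIMBS_PER_VR leftover limbs with integer loads and stores.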
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 0		')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 0(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()