xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc32/submul_1.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
2dnl  the result from a second limb vector.
3
4dnl  Copyright 1995, 1997, 1998, 2000, 2002, 2005 Free Software Foundation,
5dnl  Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C                cycles/limb
36C 603e:            ?
37C 604e:            7.5
38C 75x (G3):        9.3-15
39C 7400,7410 (G4):  9.3-15
40C 744x,745x (G4+): 10.5
41C power4/ppc970:   6.75
42C power5:          6.5
43
44C INPUT PARAMETERS
45C rp	r3
46C up	r4
47C n	r5
48C vl	r6
49
50C This is optimized for the PPC604.  See addmul_1.asm for additional comments.
51
52ASM_START()
C mpn_submul_1 -- {rp,n} -= {up,n} * vl
C
C Each limb up[i] is multiplied by vl (mullw = low word, mulhwu = high
C word), the low word plus the running cy_limb is subtracted from rp[i],
C and the high word plus any pending carry/borrow becomes the next cy_limb.
C On exit r3 holds the final cy_limb plus the last pending borrow.
C
C Two code paths:
C   n <= 9: one limb per iteration, software pipelined (L(loop)).
C   n >  9: limb 0 is peeled off, then a 4-way unrolled loop (L(loopU))
C           runs (n-1)/4 times and a one-limb loop (L(loopE)) handles the
C           remaining (n-1) mod 4 limbs.
C
C Carry convention: subfc/subfe leave CA = 1 when there was NO borrow,
C whereas the adde that starts the next limb must see CA = 1 when a borrow
C IS pending.  Every subtract sequence is therefore followed by code that
C inverts CA ("invert cy": addc after subfc, or subfe+addic in L(loopU)).
C None of the loads, stores, multiplies, addi, mtctr, andi. or bd(n)z in
C between modify CA, so it survives from one limb to the next.
C
C NOTE(review): mtctr with n = 0 followed by bdz would wrap the counter;
C presumably callers guarantee n >= 1 -- confirm against the mpn contract.
53PROLOGUE(mpn_submul_1)
54	cmpwi	cr0,r5,9	C more than 9 limbs?
55	bgt	cr0,L(big)	C branch if more than 9 limbs
56
C Small case, n <= 9.  Limb 0 is computed ahead of the loop; each loop
C iteration then stores the previous result limb and computes the next.
57	mtctr	r5		C ctr = n
58	lwz	r0,0(r4)	C r0 = up[0]
59	mullw	r7,r0,r6	C r7 = low word of up[0] * vl
60	mulhwu	r10,r0,r6	C r10 = high word of up[0] * vl (cy_limb)
61	lwz	r9,0(r3)	C r9 = rp[0]
62	subfc	r8,r7,r9	C r8 = rp[0] - r7, CA = 1 if no borrow
63	addc	r7,r7,r8	C invert cy (r7 is junk)
64	addi	r3,r3,-4	C bias rp: 4(r3) is now the limb to store
65	bdz	L(end)		C n == 1: only the final store is left
66L(loop):
67	lwzu	r0,4(r4)	C r0 = next up limb, advance up
68	stwu	r8,4(r3)	C store previous result limb, advance rp
69	mullw	r8,r0,r6	C r8 = low word of product
70	adde	r7,r8,r10	C r7 = low word + cy_limb + pending borrow
71	mulhwu	r10,r0,r6	C r10 = high word of product
72	lwz	r9,4(r3)	C r9 = rp limb matching the current up limb
73	addze	r10,r10		C new cy_limb = high word + carry from adde
74	subfc	r8,r7,r9	C r8 = rp limb - r7, CA = 1 if no borrow
75	addc	r7,r7,r8	C invert cy (r7 is junk)
76	bdnz	L(loop)
77L(end):	stw	r8,4(r3)	C store the last result limb
78	addze	r3,r10		C r3 = cy_limb + pending borrow
79	blr
80
C Large case, n > 9.  r30 and r31 are used as scratch in L(loopU), so
C they are saved in a 16-byte stack frame here and reloaded at L(endx).
81L(big):	stwu	r1,-16(r1)	C push frame; old r1 saved at 0(r1)
82	addi	r5,r5,-1	C r5 = n - 1 (limb 0 is done separately)
83	stw	r30,8(r1)	C save r30
84	srwi	r0,r5,2		C r0 = (n-1) / 4
85	stw	r31,12(r1)	C save r31
86	mtctr	r0		C ctr = number of 4-limb iterations
87
C Limb 0: same computation as in the small case, stored immediately.
88	lwz	r7,0(r4)	C r7 = up[0]
89	mullw	r8,r7,r6	C r8 = low word of up[0] * vl
90	mulhwu	r0,r7,r6	C r0 = high word (cy_limb)
91	lwz	r7,0(r3)	C r7 = rp[0]
92	subfc	r7,r8,r7	C r7 = rp[0] - r8, CA = 1 if no borrow
93	addc	r8,r8,r7	C invert cy (r8 is junk)
94	stw	r7,0(r3)	C rp[0] = result
95
C Four limbs per iteration.  Offsets 4..16 are relative to the limb handled
C last; the lwzu/stwu on the fourth limb advance up/rp by 16 bytes.
96L(loopU):
97	lwz	r7,4(r4)	C r7  = up limb 1 of this group
98	lwz	r12,8(r4)	C r12 = up limb 2
99	lwz	r30,12(r4)	C r30 = up limb 3
100	lwzu	r31,16(r4)	C r31 = up limb 4, advance up by 4 limbs
101	mullw	r8,r7,r6	C r8..r11 = low words of the four products
102	mullw	r9,r12,r6
103	mullw	r10,r30,r6
104	mullw	r11,r31,r6
105	adde	r8,r8,r0	C add cy_limb
106	mulhwu	r0,r7,r6	C r0 = high word of product 1
107	lwz	r7,4(r3)	C r7 = rp limb 1
108	adde	r9,r9,r0	C low 2 + high 1 + carry from previous adde
109	mulhwu	r0,r12,r6	C r0 = high word of product 2
110	lwz	r12,8(r3)	C r12 = rp limb 2
111	adde	r10,r10,r0	C low 3 + high 2 + carry
112	mulhwu	r0,r30,r6	C r0 = high word of product 3
113	lwz	r30,12(r3)	C r30 = rp limb 3
114	adde	r11,r11,r0	C low 4 + high 3 + carry
115	mulhwu	r0,r31,r6	C r0 = high word of product 4
116	lwz	r31,16(r3)	C r31 = rp limb 4
117	addze	r0,r0		C new cy_limb
118	subfc	r7,r8,r7	C rp limb 1 - r8, starts the borrow chain
119	stw	r7,4(r3)
120	subfe	r12,r9,r12	C rp limb 2 - r9 - borrow
121	stw	r12,8(r3)
122	subfe	r30,r10,r30	C rp limb 3 - r10 - borrow
123	stw	r30,12(r3)
124	subfe	r31,r11,r31	C rp limb 4 - r11 - borrow
125	stwu	r31,16(r3)	C store limb 4, advance rp by 4 limbs
126	subfe	r11,r11,r11	C invert ...
127	addic	r11,r11,1	C ... carry
128	bdnz	L(loopU)
129
C Leftover limbs: (n-1) mod 4.  andi. sets cr0 for the beq below.
130	andi.	r31,r5,3	C r31 = (n-1) mod 4
131	mtctr	r31		C ctr = leftover limb count
132	beq	cr0,L(endx)	C none left: skip the one-limb loop
133
C One limb per iteration; result is stored in the same iteration.
134L(loopE):
135	lwzu	r7,4(r4)	C r7 = next up limb, advance up
136	mullw	r8,r7,r6	C r8 = low word of product
137	adde	r8,r8,r0	C add cy_limb
138	mulhwu	r0,r7,r6	C r0 = high word of product
139	lwz	r7,4(r3)	C r7 = matching rp limb
140	addze	r0,r0		C new cy_limb
141	subfc	r7,r8,r7	C r7 = rp limb - r8, CA = 1 if no borrow
142	addc	r8,r8,r7	C invert cy (r8 is junk)
143	stwu	r7,4(r3)	C store result limb, advance rp
144	bdnz	L(loopE)
145L(endx):
146	addze	r3,r0		C r3 = cy_limb + pending borrow
147	lwz	r30,8(r1)	C restore r30
148	lwz	r31,12(r1)	C restore r31
149	addi	r1,r1,16	C pop the 16-byte frame
150	blr
151EPILOGUE(mpn_submul_1)
152