dnl  SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      17.5

C Algorithm: We use eight floating-point multiplies per limb product, with the
C invariant v operand split into four 16-bit pieces, and the up operand split
C into 32-bit pieces.  We sum pairs of 48-bit partial products using
C floating-point add, then convert the four 49-bit product-sums to integers and
C transfer them to the integer unit.

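C As a rough, illustrative C model of this splitting scheme (the helper name
C and the use of unsigned __int128 are for exposition only; this is not the
C GMP reference code), a single limb product can be written as:
C
C   #include <stdint.h>
C
C   static void
C   umul_via_splits (uint64_t u, uint64_t v, uint64_t *hi, uint64_t *lo)
C   {
C     uint64_t v00 = v & 0xffff,          v16 = (v >> 16) & 0xffff;
C     uint64_t v32 = (v >> 32) & 0xffff,  v48 = v >> 48;
C     uint64_t u00 = u & 0xffffffff,      u32 = u >> 32;
C     /* Eight 48-bit partial products, paired by weight into 49-bit sums.  */
C     uint64_t a00 = u00 * v00;               /* weight 2^0  */
C     uint64_t a16 = u00 * v16;               /* weight 2^16 */
C     uint64_t a32 = u00 * v32 + u32 * v00;   /* weight 2^32 */
C     uint64_t a48 = u00 * v48 + u32 * v16;   /* weight 2^48 */
C     uint64_t a64 = u32 * v32;               /* weight 2^64 */
C     uint64_t a80 = u32 * v48;               /* weight 2^80 */
C     unsigned __int128 p = (unsigned __int128) a00
C       + ((unsigned __int128) a16 << 16) + ((unsigned __int128) a32 << 32)
C       + ((unsigned __int128) a48 << 48) + ((unsigned __int128) a64 << 64)
C       + ((unsigned __int128) a80 << 80);
C     *lo = (uint64_t) p;
C     *hi = (uint64_t) (p >> 64);
C   }
C
C In the code below the 2^64 and 2^80 terms (r64, r80) are not folded in
C immediately; they are carried into the a00/a16 sums of the next limb.
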
C Possible optimizations:
C   0. Rewrite to use the algorithm of mpn_addmul_2.
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize cache collisions.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before up?)
C   2. Sum the four 49-bit quantities using 32-bit operations, as in the
C      development mpn_addmul_2.  This would save many integer instructions.
C   3. Unrolling.  Questionable whether it is worth the code expansion, given
C      that it could save only 1 cycle/limb.
C   4. Specialize for particular v values.  If the upper 32 bits of v are zero,
C      we could save many operations, in the FPU (fmuld) but more so in the
C      IEU, since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies further
C      apart, and the i00,i16,i32,i48 RAW dependencies closer together.  The
C      latter spacing should not be greater than needed for L2 cache latency,
C      and also not so great that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least when we want
C      high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU ops.)

C Instruction classification (as per UltraSPARC-1/2 functional units):
C    8 FM
C   10 FA
C   12 MEM
C   10 ISHIFT + 14 IADDLOG
C    1 BRANCH
C   55 insns in total (plus one mov insn that should be optimized out)

C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
C sustain the peak execution rate of 4 instructions/cycle.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3
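C In C terms the entry point is mp_limb_t mpn_addmul_1 (mp_ptr rp, mp_srcptr up,
C mp_size_t n, mp_limb_t v); it returns the most significant product limb plus
C the carry-out from the addition into {rp, n}.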

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32', `%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')

PROLOGUE(mpn_addmul_1)

C Initialization.  (1) Split the v operand into four 16-bit chunks and store
C them as IEEE doubles in fp registers.  (2) Clear the upper 32 bits of the fp
C register pairs f2/f3 and f4/f5.  (3) Store masks in the registers aliased to
C `xffff' and `xffffffff'.
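C For example, with v = 0x123456789abcdef0 the four pieces stored at
C [%sp+2223+{0,8,16,24}] are 0xdef0, 0x9abc, 0x5678 and 0x1234; each is then
C loaded with ldd and converted to a double with fxtod.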

	save	%sp, -256, %sp
	mov	-1, %g4
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	and	%i3, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	sllx	%i2, 3, %i2
	mov	0, cy			C clear cy
	add	%i0, %i2, %i0
	add	%i1, %i2, %i1
	neg	%i2
	add	%i1, 4, %i5
	add	%i0, -32, %i4
	add	%i0, -16, %i0

	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
	ld	[%sp+2223+0],%f2	C zero f2
	ld	[%sp+2223+0],%f4	C zero f4
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fxtod	v00, v00
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C Start real work.  (We sneakily read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.

	fxtod	%f2, u00
	fxtod	%f4, u32
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48

.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	ldx	[%i0+%i2], rlimb	C read rp[i]
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	add	i00, %g5, %g5		C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2

.L_two_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48

.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	add	i00, %g5, %g5		C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2

.L_three_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48

.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2

.L_four_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48

.L_four:
	b,a	.L_out_4

C BEGIN MAIN LOOP
	.align	16
.Loop:
C 00
	srlx	%o4, 16, %o5		C (x >> 16)
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48
C END MAIN LOOP

.L_out_4:
	srlx	%o4, 16, %o5		C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	a00, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_3:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	r64, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_2:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_1:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	or	%i3, %o5, %o5
	stx	%o5, [%i4+%i2]

	sllx	i00, 0, %g2
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

	return	%i7+8
	mov	cy, %o0
EPILOGUE(mpn_addmul_1)