xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc64/ultrasparc1234/addmul_1.asm (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1dnl  SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
2dnl  the result to a second limb vector.
3
4dnl  Copyright 1998, 2000-2004 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C		   cycles/limb
35C UltraSPARC 1&2:     14
36C UltraSPARC 3:	      17.5
37
38C Algorithm: We use eight floating-point multiplies per limb product, with the
39C invariant v operand split into four 16-bit pieces, and the up operand split
40C into 32-bit pieces.  We sum pairs of 48-bit partial products using
41C floating-point add, then convert the four 49-bit product-sums and transfer
42C them to the integer unit.
43
44C Possible optimizations:
45C   0. Rewrite to use algorithm of mpn_addmul_2.
46C   1. Align the stack area where we transfer the four 49-bit product-sums
47C      to a 32-byte boundary.  That would minimize the cache collision.
48C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
49C      be to align the area to map to the area immediately before up?)
50C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
51C      develop mpn_addmul_2.  This would save many integer instructions.
52C   3. Unrolling.  Questionable if it is worth the code expansion, given that
53C      it could only save 1 cycle/limb.
54C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
55C      could save many operations, in the FPU (fmuld), but more so in the IEU
56C      since we'll be summing 48-bit quantities, which might be simpler.
57C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
58C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
59C      not be greater than needed for L2 cache latency, and also not so great
60C      that i16 needs to be copied.
61C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
62C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
63C      ops.)
64
65C Instruction classification (as per UltraSPARC-1/2 functional units):
66C    8 FM
67C   10 FA
68C   12 MEM
69C   10 ISHIFT + 14 IADDLOG
70C    1 BRANCH
71C   55 insns in total (plus one mov insn that should be optimized out)
72
73C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
74C sustain the peak execution rate of 4 instructions/cycle.
75
76C INPUT PARAMETERS
77C rp	i0
78C up	i1
79C n	i2
80C v	i3
81
82ASM_START()
C NOTE(review): REGISTER is defined outside this file; presumably it tells the
C assembler that g2 and g3 are used as scratch registers -- confirm in the m4
C definitions pulled in by config.m4.
83	REGISTER(%g2,#scratch)
84	REGISTER(%g3,#scratch)
85
C Symbolic register names used by mpn_addmul_1 below.
C   p00 p16 p32 p48   products of u00 with v00, v16, v32, v48
C   r32 r48 r64 r80   products of u32 with v00, v16, v32, v48
C   v00 v16 v32 v48   the four 16-bit pieces of v, converted to double
C   u00 u32           low and high 32-bit halves of the current up limb, as double
C   a00 a16 a32 a48   product sums; converted to integer and stored to the stack
C   cy                carry passed from limb to limb, finally the return value
C   rlimb             limb loaded from rp
C   i00 i16 i32 i48   the product sums reloaded from the stack as integers
C   xffffffff xffff   masks for the low 32 and low 16 bits
86define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
87define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
88define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
89define(`u00',`%f32') define(`u32', `%f34')
90define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
91define(`cy',`%g1')
92define(`rlimb',`%g3')
93define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
94define(`xffffffff',`%l7')
95define(`xffff',`%o0')
96
97PROLOGUE(mpn_addmul_1)
C Multiplies the n limbs at up by the limb v, adds the products into the n
C limbs at rp, and returns the carry-out limb.  After the save below rp, up,
C n and v are in i0, i1, i2 and i3.
C The 32 bytes at [%sp+2223+0] .. [%sp+2223+31] are used as scratch to move
C 64-bit values between the integer and the floating-point registers.
C NOTE(review): 2223 is presumably the SPARC V9 stack bias 2047 plus a 176 byte
C register save and argument area -- confirm against the ABI.
98
99C Initialization.  (1) Split v operand into four 16-bit chunks and store them
100C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
101C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
102
103	save	%sp, -256, %sp		C new register window, 256 byte frame
104	mov	-1, %g4			C all ones, shifted below to make the masks
105	srlx	%g4, 48, xffff		C store mask in register `xffff'
106	and	%i3, xffff, %g2		C v bits 0-15
107	stx	%g2, [%sp+2223+0]
108	srlx	%i3, 16, %g3
109	and	%g3, xffff, %g3		C v bits 16-31
110	stx	%g3, [%sp+2223+8]
111	srlx	%i3, 32, %g2
112	and	%g2, xffff, %g2		C v bits 32-47
113	stx	%g2, [%sp+2223+16]
114	srlx	%i3, 48, %g3		C v bits 48-63, no mask needed
115	stx	%g3, [%sp+2223+24]
116	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
117
118	sllx	%i2, 3, %i2		C n limbs -> 8n bytes
119	mov	0, cy			C clear cy
120	add	%i0, %i2, %i0		C i0 = rp + 8n
121	add	%i1, %i2, %i1		C i1 = up + 8n, base for the high word loads
122	neg	%i2			C i2 = -8n, byte index counting up to zero
123	add	%i1, 4, %i5		C i5 = i1 + 4, base for the low word loads
124	add	%i0, -32, %i4		C store base: a limb is stored 4 index steps after its up load
125	add	%i0, -16, %i0		C load base: rp[i] is loaded 2 index steps after up[i]
126
127	ldd	[%sp+2223+0], v00	C fetch the four pieces of v as raw 64-bit integers
128	ldd	[%sp+2223+8], v16
129	ldd	[%sp+2223+16], v32
130	ldd	[%sp+2223+24], v48
131	ld	[%sp+2223+0],%f2	C zero f2
132	ld	[%sp+2223+0],%f4	C zero f4
133	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
134	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
135	fxtod	v00, v00		C integer -> double, in place
136	fxtod	v16, v16
137	fxtod	v32, v32
138	fxtod	v48, v48
139
140C Start real work.  (We sneakingly read f3 and f5 above...)
141C The software pipeline is very deep, requiring 4 feed-in stages.
142
C First feed-in stage.  There is no previous limb, so the products with v00
C and v16 go straight into a00 and a16 instead of being added to r64 and r80.
143	fxtod	%f2, u00		C low half of up[0] as double
144	fxtod	%f4, u32		C high half of up[0] as double
145	fmuld	u00, v00, a00
146	fmuld	u00, v16, a16
147	fmuld	u00, v32, p32
148	fmuld	u32, v00, r32
149	fmuld	u00, v48, p48
150	addcc	%i2, 8, %i2		C advance the index and set the condition codes
151	bnz,pt	%xcc, .L_two_or_more	C index not yet zero: at least two limbs
152	fmuld	u32, v16, r48		C delay slot
153
154.L_one:
C n = 1: the index reached zero after the first limb, so no further up limb
C is read.  The products of the single limb are summed, converted to integer
C and passed through the stack scratch slots into i00..i48, then combined
C with rp[0] before joining the common tail at .L_out_1.  The index is still
C advanced by 8 three times so that the store in .L_out_1 through i4
C addresses rp[0].
155	fmuld	u32, v32, r64	C FIXME not urgent
156	faddd	p32, r32, a32
157	fdtox	a00, a00
158	faddd	p48, r48, a48
159	fmuld	u32, v48, r80	C FIXME not urgent
160	fdtox	a16, a16
161	fdtox	a32, a32
162	fdtox	a48, a48
163	std	a00, [%sp+2223+0]
164	std	a16, [%sp+2223+8]
165	std	a32, [%sp+2223+16]
166	std	a48, [%sp+2223+24]
167	add	%i2, 8, %i2
168
169	fdtox	r64, a00		C r64 as integer, reloaded into i00 further down
170	ldx	[%i0+%i2], rlimb	C read rp[i]
171	fdtox	r80, a16		C r80 as integer, reloaded into i16 further down
172	ldx	[%sp+2223+0], i00
173	ldx	[%sp+2223+8], i16
174	ldx	[%sp+2223+16], i32
175	ldx	[%sp+2223+24], i48
176	std	a00, [%sp+2223+0]
177	std	a16, [%sp+2223+8]
178	add	%i2, 8, %i2
179
C Integer combination of the four sums with rp[i]:
C   hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32)
C   mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48)
C   x    = cy + i00 + LO(rlimb)
180	srlx	rlimb, 32, %g4		C HI(rlimb)
181	and	rlimb, xffffffff, %g5	C LO(rlimb)
182	add	i00, %g5, %g5		C i00+ now in g5
183	ldx	[%sp+2223+0], i00
184	srlx	i16, 48, %l4		C (i16 >> 48)
185	mov	i16, %g2
186	ldx	[%sp+2223+8], i16
187	srlx	i48, 16, %l5		C (i48 >> 16)
188	add	i32, %g4, %g4		C i32+ now in g4
189	sllx	i48, 32, %l6		C (i48 << 32)
190	srlx	%g4, 32, %o3		C (i32 >> 32)
191	add	%l5, %l4, %o1		C hi64- in %o1
192	std	a00, [%sp+2223+0]
193	sllx	%g4, 16, %o2		C (i32 << 16)
194	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
195	std	a16, [%sp+2223+8]
196	sllx	%o1, 48, %o3		C (hi64 << 48)
197	add	%g2, %o2, %o2		C mi64- in %o2
198	add	%l6, %o2, %o2		C mi64- in %o2
199	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
200	add	cy, %g5, %o4		C x = prev(i00) + cy
201	b	.L_out_1
202	add	%i2, 8, %i2		C delay slot
203
204.L_two_or_more:
C Second feed-in stage, reached when n >= 2.  Reads the next up limb, finishes
C and converts the sums of the previous limb and stores them to the four
C scratch slots, then starts the products of the new limb.  From here on a00
C and a16 are p00 + r64 and p16 + r80, i.e. they include the two high
C products of the previous limb.
205	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
206	fmuld	u32, v32, r64	C FIXME not urgent
207	faddd	p32, r32, a32
208	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
209	fdtox	a00, a00
210	faddd	p48, r48, a48
211	fmuld	u32, v48, r80	C FIXME not urgent
212	fdtox	a16, a16
213	fdtox	a32, a32
214	fxtod	%f2, u00		C u00 and u32 now describe the new limb
215	fxtod	%f4, u32
216	fdtox	a48, a48
217	std	a00, [%sp+2223+0]
218	fmuld	u00, v00, p00
219	std	a16, [%sp+2223+8]
220	fmuld	u00, v16, p16
221	std	a32, [%sp+2223+16]
222	fmuld	u00, v32, p32
223	std	a48, [%sp+2223+24]
224	faddd	p00, r64, a00
225	fmuld	u32, v00, r32
226	faddd	p16, r80, a16
227	fmuld	u00, v48, p48
228	addcc	%i2, 8, %i2		C advance the index and set the condition codes
229	bnz,pt	%xcc, .L_three_or_more	C index not yet zero: at least three limbs
230	fmuld	u32, v16, r48		C delay slot
231
232.L_two:
C n = 2: the index reached zero after the second limb, so no further up limb
C is read.  The sums of the first limb are reloaded from the scratch slots
C before the sums of the second limb overwrite them, the first limb is
C combined with its rp limb, and control joins the common tail at .L_out_2.
233	fmuld	u32, v32, r64	C FIXME not urgent
234	faddd	p32, r32, a32
235	fdtox	a00, a00
236	ldx	[%i0+%i2], rlimb	C read rp[i]
237	faddd	p48, r48, a48
238	fmuld	u32, v48, r80	C FIXME not urgent
239	fdtox	a16, a16
240	ldx	[%sp+2223+0], i00
241	fdtox	a32, a32
242	ldx	[%sp+2223+8], i16
243	ldx	[%sp+2223+16], i32
244	ldx	[%sp+2223+24], i48
245	fdtox	a48, a48
246	std	a00, [%sp+2223+0]
247	std	a16, [%sp+2223+8]
248	std	a32, [%sp+2223+16]
249	std	a48, [%sp+2223+24]
250	add	%i2, 8, %i2
251
C Integer combination of the four sums with rp[i]:
C   hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32)
C   mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48)
C   x    = cy + i00 + LO(rlimb)
C The loads interleaved below fetch rlimb and i00..i48 for the next limb.
252	fdtox	r64, a00
253	srlx	rlimb, 32, %g4		C HI(rlimb)
254	and	rlimb, xffffffff, %g5	C LO(rlimb)
255	ldx	[%i0+%i2], rlimb	C read rp[i]
256	add	i00, %g5, %g5		C i00+ now in g5
257	fdtox	r80, a16
258	ldx	[%sp+2223+0], i00
259	srlx	i16, 48, %l4		C (i16 >> 48)
260	mov	i16, %g2
261	ldx	[%sp+2223+8], i16
262	srlx	i48, 16, %l5		C (i48 >> 16)
263	add	i32, %g4, %g4		C i32+ now in g4
264	ldx	[%sp+2223+16], i32
265	sllx	i48, 32, %l6		C (i48 << 32)
266	ldx	[%sp+2223+24], i48
267	srlx	%g4, 32, %o3		C (i32 >> 32)
268	add	%l5, %l4, %o1		C hi64- in %o1
269	std	a00, [%sp+2223+0]
270	sllx	%g4, 16, %o2		C (i32 << 16)
271	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
272	std	a16, [%sp+2223+8]
273	sllx	%o1, 48, %o3		C (hi64 << 48)
274	add	%g2, %o2, %o2		C mi64- in %o2
275	add	%l6, %o2, %o2		C mi64- in %o2
276	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
277	add	cy, %g5, %o4		C x = prev(i00) + cy
278	b	.L_out_2
279	add	%i2, 8, %i2		C delay slot
280
281.L_three_or_more:
C Third feed-in stage, reached when n >= 3.  Same floating-point work as the
C second stage, plus the first integer-side loads: rlimb is read from rp and
C i00..i48 are reloaded from the scratch slots before the std instructions
C below overwrite those slots with the sums of the following limb.
282	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
283	fmuld	u32, v32, r64	C FIXME not urgent
284	faddd	p32, r32, a32
285	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
286	fdtox	a00, a00
287	ldx	[%i0+%i2], rlimb	C read rp[i]
288	faddd	p48, r48, a48
289	fmuld	u32, v48, r80	C FIXME not urgent
290	fdtox	a16, a16
291	ldx	[%sp+2223+0], i00
292	fdtox	a32, a32
293	ldx	[%sp+2223+8], i16
294	fxtod	%f2, u00
295	ldx	[%sp+2223+16], i32
296	fxtod	%f4, u32
297	ldx	[%sp+2223+24], i48
298	fdtox	a48, a48
299	std	a00, [%sp+2223+0]
300	fmuld	u00, v00, p00
301	std	a16, [%sp+2223+8]
302	fmuld	u00, v16, p16
303	std	a32, [%sp+2223+16]
304	fmuld	u00, v32, p32
305	std	a48, [%sp+2223+24]
306	faddd	p00, r64, a00
307	fmuld	u32, v00, r32
308	faddd	p16, r80, a16
309	fmuld	u00, v48, p48
310	addcc	%i2, 8, %i2		C advance the index and set the condition codes
311	bnz,pt	%xcc, .L_four_or_more	C index not yet zero: at least four limbs
312	fmuld	u32, v16, r48		C delay slot
313
314.L_three:
C n = 3: the index reached zero after the third limb, so no further up limb
C is read.  The sums of the last limb are finished and stored while the first
C limb is combined with its rp limb; control then joins the common tail at
C .L_out_3.
C   hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32)
C   mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48)
C   x    = cy + i00 + LO(rlimb)
315	fmuld	u32, v32, r64	C FIXME not urgent
316	faddd	p32, r32, a32
317	fdtox	a00, a00
318	srlx	rlimb, 32, %g4		C HI(rlimb)
319	and	rlimb, xffffffff, %g5	C LO(rlimb)
320	ldx	[%i0+%i2], rlimb	C read rp[i]
321	faddd	p48, r48, a48
322	add	i00, %g5, %g5		C i00+ now in g5
323	fmuld	u32, v48, r80	C FIXME not urgent
324	fdtox	a16, a16
325	ldx	[%sp+2223+0], i00
326	fdtox	a32, a32
327	srlx	i16, 48, %l4		C (i16 >> 48)
328	mov	i16, %g2
329	ldx	[%sp+2223+8], i16
330	srlx	i48, 16, %l5		C (i48 >> 16)
331	add	i32, %g4, %g4		C i32+ now in g4
332	ldx	[%sp+2223+16], i32
333	sllx	i48, 32, %l6		C (i48 << 32)
334	ldx	[%sp+2223+24], i48
335	fdtox	a48, a48
336	srlx	%g4, 32, %o3		C (i32 >> 32)
337	add	%l5, %l4, %o1		C hi64- in %o1
338	std	a00, [%sp+2223+0]
339	sllx	%g4, 16, %o2		C (i32 << 16)
340	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
341	std	a16, [%sp+2223+8]
342	sllx	%o1, 48, %o3		C (hi64 << 48)
343	add	%g2, %o2, %o2		C mi64- in %o2
344	std	a32, [%sp+2223+16]
345	add	%l6, %o2, %o2		C mi64- in %o2
346	std	a48, [%sp+2223+24]
347	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
348	add	cy, %g5, %o4		C x = prev(i00) + cy
349	b	.L_out_3
350	add	%i2, 8, %i2		C delay slot
351
352.L_four_or_more:
C Fourth and last feed-in stage, reached when n >= 4.  This is the main loop
C body without the instructions that finish and store a result limb, since no
C limb has been fully combined yet.  It leaves hi64 in o1, mi64 in o2 and x in
C o4 for the first limb:
C   hi64 = (i16 >> 48) + (i48 >> 16) + ((i32 + HI(rlimb)) >> 32)
C   mi64 = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32) - (hi64 << 48)
C   x    = cy + i00 + LO(rlimb)
353	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
354	fmuld	u32, v32, r64	C FIXME not urgent
355	faddd	p32, r32, a32
356	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
357	fdtox	a00, a00
358	srlx	rlimb, 32, %g4		C HI(rlimb)
359	and	rlimb, xffffffff, %g5	C LO(rlimb)
360	ldx	[%i0+%i2], rlimb	C read rp[i]
361	faddd	p48, r48, a48
362	add	i00, %g5, %g5		C i00+ now in g5
363	fmuld	u32, v48, r80	C FIXME not urgent
364	fdtox	a16, a16
365	ldx	[%sp+2223+0], i00
366	fdtox	a32, a32
367	srlx	i16, 48, %l4		C (i16 >> 48)
368	mov	i16, %g2
369	ldx	[%sp+2223+8], i16
370	fxtod	%f2, u00
371	srlx	i48, 16, %l5		C (i48 >> 16)
372	add	i32, %g4, %g4		C i32+ now in g4
373	ldx	[%sp+2223+16], i32
374	fxtod	%f4, u32
375	sllx	i48, 32, %l6		C (i48 << 32)
376	ldx	[%sp+2223+24], i48
377	fdtox	a48, a48
378	srlx	%g4, 32, %o3		C (i32 >> 32)
379	add	%l5, %l4, %o1		C hi64- in %o1
380	std	a00, [%sp+2223+0]
381	fmuld	u00, v00, p00
382	sllx	%g4, 16, %o2		C (i32 << 16)
383	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
384	std	a16, [%sp+2223+8]
385	fmuld	u00, v16, p16
386	sllx	%o1, 48, %o3		C (hi64 << 48)
387	add	%g2, %o2, %o2		C mi64- in %o2
388	std	a32, [%sp+2223+16]
389	fmuld	u00, v32, p32
390	add	%l6, %o2, %o2		C mi64- in %o2
391	std	a48, [%sp+2223+24]
392	faddd	p00, r64, a00
393	fmuld	u32, v00, r32
394	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
395	faddd	p16, r80, a16
396	fmuld	u00, v48, p48
397	add	cy, %g5, %o4		C x = prev(i00) + cy
398	addcc	%i2, 8, %i2		C advance the index and set the condition codes
399	bnz,pt	%xcc, .Loop		C index not yet zero: at least five limbs
400	fmuld	u32, v16, r48		C delay slot
401
402.L_four:
C n = 4: skip the main loop and go straight to the wind-down code.  The
C branch is annulled (b,a) so that no delay slot instruction is executed.
403	b,a	.L_out_4
404
405C BEGIN MAIN LOOP
C Steady state of the software pipeline.  Each pass works on several limbs at
C once:
C   - reads a new up limb and starts its eight products,
C   - finishes, converts and stores the four sums of the limb before it,
C   - reloads i00..i48 and rlimb for an earlier limb and forms hi64, mi64 and
C     x for it,
C   - completes the oldest limb from the hi64, mi64 and x left by the previous
C     pass and stores it through i4:
C       mi64 += x >> 16;  cy = (mi64 >> 48) + hi64;
C       result = (mi64 << 16) | (x & 0xffff)
C The markers C 00 .. C 13 label the 14 groups of four instructions.
C The loop ends when the byte index in i2 reaches zero.
406	.align	16
407.Loop:
408C 00
409	srlx	%o4, 16, %o5		C (x >> 16)
410	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
411	fmuld	u32, v32, r64	C FIXME not urgent
412	faddd	p32, r32, a32
413C 01
414	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
415	and	%o4, xffff, %o5		C (x & 0xffff)
416	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
417	fdtox	a00, a00
418C 02
419	srlx	rlimb, 32, %g4		C HI(rlimb)
420	and	rlimb, xffffffff, %g5	C LO(rlimb)
421	ldx	[%i0+%i2], rlimb	C read rp[i]
422	faddd	p48, r48, a48
423C 03
424	srlx	%o2, 48, %o7		C (mi64 >> 48)
425	add	i00, %g5, %g5		C i00+ now in g5
426	fmuld	u32, v48, r80	C FIXME not urgent
427	fdtox	a16, a16
428C 04
429	sllx	%o2, 16, %i3		C (mi64 << 16)
430	add	%o7, %o1, cy		C new cy
431	ldx	[%sp+2223+0], i00
432	fdtox	a32, a32
433C 05
434	srlx	i16, 48, %l4		C (i16 >> 48)
435	mov	i16, %g2
436	ldx	[%sp+2223+8], i16
437	fxtod	%f2, u00
438C 06
439	srlx	i48, 16, %l5		C (i48 >> 16)
440	add	i32, %g4, %g4		C i32+ now in g4
441	ldx	[%sp+2223+16], i32
442	fxtod	%f4, u32
443C 07
444	sllx	i48, 32, %l6		C (i48 << 32)
445	or	%i3, %o5, %o5		C finished result limb
446	ldx	[%sp+2223+24], i48
447	fdtox	a48, a48
448C 08
449	srlx	%g4, 32, %o3		C (i32 >> 32)
450	add	%l5, %l4, %o1		C hi64- in %o1
451	std	a00, [%sp+2223+0]
452	fmuld	u00, v00, p00
453C 09
454	sllx	%g4, 16, %o2		C (i32 << 16)
455	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
456	std	a16, [%sp+2223+8]
457	fmuld	u00, v16, p16
458C 10
459	sllx	%o1, 48, %o3		C (hi64 << 48)
460	add	%g2, %o2, %o2		C mi64- in %o2
461	std	a32, [%sp+2223+16]
462	fmuld	u00, v32, p32
463C 11
464	add	%l6, %o2, %o2		C mi64- in %o2
465	std	a48, [%sp+2223+24]
466	faddd	p00, r64, a00
467	fmuld	u32, v00, r32
468C 12
469	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
470	stx	%o5, [%i4+%i2]		C write the finished result limb to rp
471	faddd	p16, r80, a16
472	fmuld	u00, v48, p48
473C 13
474	add	cy, %g5, %o4		C x = prev(i00) + cy
475	addcc	%i2, 8, %i2		C advance the index and set the condition codes
476	bnz,pt	%xcc, .Loop
477	fmuld	u32, v16, r48		C delay slot
478C END MAIN LOOP
479
480.L_out_4:
C First wind-down stage.  Same as a main loop pass except that no new up limb
C is read and no new products with u00 are started: the sums of the last limb
C are finished and stored, the next pending limb is combined with its rp limb,
C and the oldest pending result limb is completed and stored through i4.
C Falls through into .L_out_3.
481	srlx	%o4, 16, %o5		C (x >> 16)
482	fmuld	u32, v32, r64	C FIXME not urgent
483	faddd	p32, r32, a32
484	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
485	and	%o4, xffff, %o5		C (x & 0xffff)
486	fdtox	a00, a00
487	srlx	rlimb, 32, %g4		C HI(rlimb)
488	and	rlimb, xffffffff, %g5	C LO(rlimb)
489	ldx	[%i0+%i2], rlimb	C read rp[i]
490	faddd	p48, r48, a48
491	srlx	%o2, 48, %o7		C (mi64 >> 48)
492	add	i00, %g5, %g5		C i00+ now in g5
493	fmuld	u32, v48, r80	C FIXME not urgent
494	fdtox	a16, a16
495	sllx	%o2, 16, %i3		C (mi64 << 16)
496	add	%o7, %o1, cy		C new cy
497	ldx	[%sp+2223+0], i00
498	fdtox	a32, a32
499	srlx	i16, 48, %l4		C (i16 >> 48)
500	mov	i16, %g2
501	ldx	[%sp+2223+8], i16
502	srlx	i48, 16, %l5		C (i48 >> 16)
503	add	i32, %g4, %g4		C i32+ now in g4
504	ldx	[%sp+2223+16], i32
505	sllx	i48, 32, %l6		C (i48 << 32)
506	or	%i3, %o5, %o5		C finished result limb
507	ldx	[%sp+2223+24], i48
508	fdtox	a48, a48
509	srlx	%g4, 32, %o3		C (i32 >> 32)
510	add	%l5, %l4, %o1		C hi64- in %o1
511	std	a00, [%sp+2223+0]
512	sllx	%g4, 16, %o2		C (i32 << 16)
513	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
514	std	a16, [%sp+2223+8]
515	sllx	%o1, 48, %o3		C (hi64 << 48)
516	add	%g2, %o2, %o2		C mi64- in %o2
517	std	a32, [%sp+2223+16]
518	add	%l6, %o2, %o2		C mi64- in %o2
519	std	a48, [%sp+2223+24]
520	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
521	stx	%o5, [%i4+%i2]		C write the finished result limb to rp
522	add	cy, %g5, %o4		C x = prev(i00) + cy
523	add	%i2, 8, %i2
524.L_out_3:
C Second wind-down stage, also the entry point for n = 3.  No floating-point
C multiplies or adds remain; r64 and r80 of the last limb are converted to
C integer and stored to the first two scratch slots.  The last rp limb is
C read here.  One pending result limb is completed and stored through i4.
C Falls through into .L_out_2.
525	srlx	%o4, 16, %o5		C (x >> 16)
526	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
527	and	%o4, xffff, %o5		C (x & 0xffff)
528	fdtox	r64, a00
529	srlx	rlimb, 32, %g4		C HI(rlimb)
530	and	rlimb, xffffffff, %g5	C LO(rlimb)
531	ldx	[%i0+%i2], rlimb	C read rp[i]
532	srlx	%o2, 48, %o7		C (mi64 >> 48)
533	add	i00, %g5, %g5		C i00+ now in g5
534	fdtox	r80, a16
535	sllx	%o2, 16, %i3		C (mi64 << 16)
536	add	%o7, %o1, cy		C new cy
537	ldx	[%sp+2223+0], i00
538	srlx	i16, 48, %l4		C (i16 >> 48)
539	mov	i16, %g2
540	ldx	[%sp+2223+8], i16
541	srlx	i48, 16, %l5		C (i48 >> 16)
542	add	i32, %g4, %g4		C i32+ now in g4
543	ldx	[%sp+2223+16], i32
544	sllx	i48, 32, %l6		C (i48 << 32)
545	or	%i3, %o5, %o5		C finished result limb
546	ldx	[%sp+2223+24], i48
547	srlx	%g4, 32, %o3		C (i32 >> 32)
548	add	%l5, %l4, %o1		C hi64- in %o1
549	std	a00, [%sp+2223+0]
550	sllx	%g4, 16, %o2		C (i32 << 16)
551	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
552	std	a16, [%sp+2223+8]
553	sllx	%o1, 48, %o3		C (hi64 << 48)
554	add	%g2, %o2, %o2		C mi64- in %o2
555	add	%l6, %o2, %o2		C mi64- in %o2
556	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
557	stx	%o5, [%i4+%i2]		C write the finished result limb to rp
558	add	cy, %g5, %o4		C x = prev(i00) + cy
559	add	%i2, 8, %i2
560.L_out_2:
C Third wind-down stage, also the entry point for n = 2.  Integer work only.
C Completes and stores one pending result limb, combines the last limb with
C the rlimb already loaded, and reloads i00 and i16 from the first two scratch
C slots; those two values are added into the carry at the end of .L_out_1.
C Falls through into .L_out_1.
561	srlx	%o4, 16, %o5		C (x >> 16)
562	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
563	and	%o4, xffff, %o5		C (x & 0xffff)
564	srlx	rlimb, 32, %g4		C HI(rlimb)
565	and	rlimb, xffffffff, %g5	C LO(rlimb)
566	srlx	%o2, 48, %o7		C (mi64 >> 48)
567	add	i00, %g5, %g5		C i00+ now in g5
568	sllx	%o2, 16, %i3		C (mi64 << 16)
569	add	%o7, %o1, cy		C new cy
570	ldx	[%sp+2223+0], i00
571	srlx	i16, 48, %l4		C (i16 >> 48)
572	mov	i16, %g2
573	ldx	[%sp+2223+8], i16
574	srlx	i48, 16, %l5		C (i48 >> 16)
575	add	i32, %g4, %g4		C i32+ now in g4
576	sllx	i48, 32, %l6		C (i48 << 32)
577	or	%i3, %o5, %o5		C finished result limb
578	srlx	%g4, 32, %o3		C (i32 >> 32)
579	add	%l5, %l4, %o1		C hi64- in %o1
580	sllx	%g4, 16, %o2		C (i32 << 16)
581	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
582	sllx	%o1, 48, %o3		C (hi64 << 48)
583	add	%g2, %o2, %o2		C mi64- in %o2
584	add	%l6, %o2, %o2		C mi64- in %o2
585	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
586	stx	%o5, [%i4+%i2]		C write the finished result limb to rp
587	add	cy, %g5, %o4		C x = prev(i00) + cy
588	add	%i2, 8, %i2
589.L_out_1:
C Final stage, also the entry point for n = 1.  Completes and stores the last
C result limb from the pending hi64 (o1), mi64 (o2) and x (o4), then adds the
C two values still held in i00 and i16 into the carry and returns it.
590	srlx	%o4, 16, %o5		C (x >> 16)
591	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
592	and	%o4, xffff, %o5		C (x & 0xffff)
593	srlx	%o2, 48, %o7		C (mi64 >> 48)
594	sllx	%o2, 16, %i3		C (mi64 << 16)
595	add	%o7, %o1, cy		C new cy
596	or	%i3, %o5, %o5		C finished result limb
597	stx	%o5, [%i4+%i2]		C write the last result limb to rp
598
599	sllx	i00, 0, %g2		C shift by zero, i.e. a copy of i00
600	add	%g2, cy, cy
601	sllx	i16, 16, %g3
602	add	%g3, cy, cy		C cy = cy + i00 + (i16 << 16)
603
C The mov sits in the delay slot of return.  Per the SPARC V9 definition of
C return the register window is restored before the delay slot executes, so
C o0 here is the o0 of the caller; cy is the global register g1 and is
C unaffected by the window change.
604	return	%i7+8
605	mov	cy, %o0
606EPILOGUE(mpn_addmul_1)
607