dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc64/ultrasparc1234/mul_1.asm (revision 19ef5b5b0bcb90f63509df6e78769de1b57c2758)
dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      18.5

C Algorithm: We use eight floating-point multiplies per limb product, with the
C invariant v operand split into four 16-bit pieces, and the s1 operand split
C into 32-bit pieces.  We sum pairs of 48-bit partial products using
C floating-point add, then convert the four 49-bit product-sums and transfer
C them to the integer unit.

C Possible optimizations:
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize the cache collision.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before s1?)
C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
C      develop mpn_addmul_2.  This would save many integer instructions.
C   3. Unrolling.  Questionable if it is worth the code expansion, given that
C      it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
C      could save many operations, in the FPU (fmuld), but more so in the IEU
C      since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
C      not be greater than needed for L2 cache latency, and also not so great
C      that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)

C Instruction classification (as per UltraSPARC-1/2 functional units):
C    8 FM
C   10 FA
C   11 MEM
C   9 ISHIFT + 10? IADDLOG
C    1 BRANCH
C   49 insns totally (plus three mov insns that should be optimized out)

C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e we
C sustain 3.79 instructions/cycle.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C Register aliases: p* = u00 partial products, r* = u32 partial products,
C v* = 16-bit pieces of the invariant limb v, u* = 32-bit pieces of up[i],
C a* = product-sums transferred to the integer unit via the stack.
define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32', `%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')
PROLOGUE(mpn_mul_1)

C Initialization.  (1) Split v operand into four 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.

	save	%sp, -256, %sp
	mov	-1, %g4
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	and	%i3, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	sllx	%i2, 3, %i2
	mov	0, cy			C clear cy
	add	%i0, %i2, %i0
	add	%i1, %i2, %i1
	neg	%i2
	add	%i1, 4, %i5
	add	%i0, -32, %i4
	add	%i0, -16, %i0

	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
	ld	[%sp+2223+0],%f2	C zero f2
	ld	[%sp+2223+0],%f4	C zero f4
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fxtod	v00, v00
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C Start real work.  (We sneakingly read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.

	fxtod	%f2, u00
	fxtod	%f4, u32
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48

.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	mov	i00, %g5		C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2

.L_two_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48

.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2

.L_three_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48

.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2

.L_four_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48

.L_four:
	b,a	.L_out_4

C BEGIN MAIN LOOP
	.align	16
.Loop:
C 00
	srlx	%o4, 16, %o5		C (x >> 16)
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48
C END MAIN LOOP

.L_out_4:
	srlx	%o4, 16, %o5		C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	a00, a00
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_3:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	r64, a00
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_2:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_1:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	or	%i3, %o5, %o5
	stx	%o5, [%i4+%i2]

	sllx	i00, 0, %g2
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

	return	%i7+8
	mov	cy, %o0
EPILOGUE(mpn_mul_1)
569EPILOGUE(mpn_mul_1)
570