dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      18.5

C Algorithm: We use eight floating-point multiplies per limb product, with the
C invariant v operand split into four 16-bit pieces, and the s1 operand split
C into 32-bit pieces.  We sum pairs of 48-bit partial products using
C floating-point add, then convert the four 49-bit product-sums and transfer
C them to the integer unit.
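
C A minimal C sketch (illustrative only, not part of this file) of the
C identity the FP code exploits; the function name and the use of unsigned
C __int128 are ours.  Each parenthesised sum combines at most two 48-bit
C partial products, so it fits in 49 bits and is exact both in a double
C (53-bit mantissa) and through fdtox.  In the code, the u32*v32 and u32*v48
C terms (weights 2^64 and 2^80) are folded into the next limb's a00/a16 sums
C as r64/r80.
C
C   #include <assert.h>
C   #include <stdint.h>
C
C   static void
C   check_limb_decomposition (uint64_t u, uint64_t v)
C   {
C     uint64_t v00 = v & 0xffff,          v16 = (v >> 16) & 0xffff;
C     uint64_t v32 = (v >> 32) & 0xffff,  v48 = v >> 48;
C     uint64_t u00 = u & 0xffffffff,      u32 = u >> 32;
C
C     unsigned __int128 p =
C         (unsigned __int128) (u00 * v00)                         /* a00 */
C       + ((unsigned __int128) (u00 * v16)             << 16)     /* a16 */
C       + ((unsigned __int128) (u00 * v32 + u32 * v00) << 32)     /* a32 */
C       + ((unsigned __int128) (u00 * v48 + u32 * v16) << 48)     /* a48 */
C       + ((unsigned __int128) (u32 * v32)             << 64)     /* r64 */
C       + ((unsigned __int128) (u32 * v48)             << 80);    /* r80 */
C
C     assert (p == (unsigned __int128) u * v);
C   }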

C Possible optimizations:
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize cache collisions.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before s1?)
C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
C      development version of mpn_addmul_2.  This would save many integer
C      instructions.
C   3. Unrolling.  Questionable whether it is worth the code expansion, given
C      that it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If the upper 32 bits of v are
C      zero, we could save many operations in the FPU (fmuld), and even more
C      in the IEU, since we would then be summing 48-bit quantities, which
C      might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies
C      further apart, and the i00,i16,i32,i48 RAW dependencies closer
C      together.  The latter spacing should be no greater than needed to
C      cover the L2 cache latency, and not so great that i16 needs to be
C      copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)

C Instruction classification (as per UltraSPARC-1/2 functional units):
C    8 FM
C   10 FA
C   11 MEM
C    9 ISHIFT + 10? IADDLOG
C    1 BRANCH
C   49 insns in total (plus three mov insns that should be optimized out)

C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
C sustain 3.79 instructions/cycle.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3
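
C For reference, mpn_mul_1 computes {rp,n} = {up,n} * v and returns the carry
C limb out of the top.  A hedged C equivalent of that contract (the function
C name and the use of unsigned __int128 are ours, for illustration only;
C 64-bit limbs assumed):
C
C   #include <gmp.h>
C
C   mp_limb_t
C   ref_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
C   {
C     mp_limb_t cy = 0;
C     for (mp_size_t i = 0; i < n; i++)
C       {
C         unsigned __int128 t = (unsigned __int128) up[i] * v + cy;
C         rp[i] = (mp_limb_t) t;          /* low 64 bits of the product   */
C         cy = (mp_limb_t) (t >> 64);     /* high 64 bits become the carry */
C       }
C     return cy;
C   }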

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32', `%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')

PROLOGUE(mpn_mul_1)

C Initialization.  (1) Split v operand into four 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
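C
C In C terms, and only as an illustration, the setup below leaves the
C following values around (every integer chunk is below 2^32, so each
C int-to-double conversion is exact):
C
C   uint64_t mask16 = 0xffff, mask32 = 0xffffffff;    /* xffff, xffffffff */
C   double dv00 = (double) ( v        & mask16);      /* v00 */
C   double dv16 = (double) ((v >> 16) & mask16);      /* v16 */
C   double dv32 = (double) ((v >> 32) & mask16);      /* v32 */
C   double dv48 = (double) ( v >> 48);                /* v48 */
C   /* Per limb, the zeroed high halves of the f2/f4 pairs plus fxtod give: */
C   double du00 = (double) (uint32_t) up[i];          /* u00 */
C   double du32 = (double) (uint32_t) (up[i] >> 32);  /* u32 */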

	save	%sp, -256, %sp
	mov	-1, %g4
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	and	%i3, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	sllx	%i2, 3, %i2
	mov	0, cy			C clear cy
	add	%i0, %i2, %i0
	add	%i1, %i2, %i1
	neg	%i2
	add	%i1, 4, %i5
	add	%i0, -32, %i4
	add	%i0, -16, %i0

	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
	ld	[%sp+2223+0],%f2	C zero f2
	ld	[%sp+2223+0],%f4	C zero f4
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fxtod	v00, v00
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C Start real work.  (We sneakily read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.
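C
C The integer recombination performed per limb (spread across the loop and
C its feed-in/wind-down code) is, as straight-line C, roughly the following
C sketch.  The names s0..s3 are ours for the four 49-bit product-sums
C (i00, i16, i32, i48); the x/hi64/mi64 names match the comments in the code
C (illustrative only):
C
C   #include <stdint.h>
C
C   /* s0..s3 are the four product-sums of one limb; *cyp holds the carry.
C      Returns the limb to store at rp[i].  */
C   static uint64_t
C   combine (uint64_t s0, uint64_t s1, uint64_t s2, uint64_t s3, uint64_t *cyp)
C   {
C     uint64_t x    = s0 + *cyp;
C     uint64_t hi64 = (s1 >> 48) + (s2 >> 32) + (s3 >> 16);
C     uint64_t mi64 = s1 + (s2 << 16) + (s3 << 32)   /* wraps mod 2^64, but */
C                     - (hi64 << 48) + (x >> 16);    /* the true value fits */
C     *cyp = (mi64 >> 48) + hi64;                    /* new cy */
C     return (mi64 << 16) | (x & 0xffff);
C   }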

	fxtod	%f2, u00
	fxtod	%f4, u32
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48

.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	mov	i00, %g5		C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2

.L_two_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48

.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2

.L_three_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48

.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2

.L_four_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48

.L_four:
	b,a	.L_out_4

C BEGIN MAIN LOOP
	.align	16
.Loop:
C 00
	srlx	%o4, 16, %o5		C (x >> 16)
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48
C END MAIN LOOP

.L_out_4:
	srlx	%o4, 16, %o5		C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	a00, a00
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_3:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	r64, a00
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_2:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_1:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	or	%i3, %o5, %o5
	stx	%o5, [%i4+%i2]

	sllx	i00, 0, %g2
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

	return	%i7+8
	mov	cy, %o0
EPILOGUE(mpn_mul_1)