xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/divrem_1.asm (revision a8c74629f602faa0ccf8a463757d7baf858bbf3a)
1dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
2dnl  unnormalized limb.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36
37C         cycles/limb
38C Itanium:    40-42
39C Itanium 2:  29-30
40
41C This was generated by gcc, then the loops were optimized.  The preinv entry
42C point was shoehorned into the file.  Lots of things outside the loops could
43C be streamlined.  It would probably be a good idea to merge the loops for
44C normalized and unnormalized divisor, since the shifting stuff is done for
45C free in parallel with other operations.  It would even be possible to merge
46C all loops, if the ld8 were made conditional.
47
48C TODO
49C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
50C    computing leading limb.
51C  * Inline and interleave limb inversion code with loop setup code.
52
53ASM_START()
54
55C HP's assembler requires these declarations for importing mpn_invert_limb
C mpn_invert_limb is the external routine that mpn_divrem_1 calls (via
C br.call) to obtain the inverse of the divisor; it is declared here as a
C global symbol of function type.
56	.global	mpn_invert_limb
57	.type	mpn_invert_limb,@function
58
59C INPUT PARAMETERS
60C rp    = r32
61C qxn   = r33
62C up    = r34
63C n     = r35
64C vl    = r36
65C vlinv = r37  (preinv only)
66C cnt = r38    (preinv only)
67
C mpn_preinv_divrem_1 -- entry point that takes the inverse of the divisor
C (vlinv) and the shift count (cnt) from the caller instead of computing them.
C
C This routine only handles the top dividend limb and the register setup.  It
C then branches into code located inside mpn_divrem_1: .Lpn (loop for a
C normalized divisor), .Lpu (loop for an unnormalized divisor) or .L435
C (quotient limbs beyond the dividend).  There is no return instruction here;
C the return is done by the shared code at .Lret.
C
C State handed to the shared code:
C   r8   vlinv
C   r40  cnt (the .Lpn path overwrites it with 0)
C   r36  vl, shifted left by cnt on the unnormalized path
C   r38  starting remainder (the incoming cnt in r38 is copied to r40 first)
C   r32  address of the next quotient limb to store, moving downwards
C   r34  address of the next dividend limb to load, moving downwards
C   r35  dividend limb count, adjusted per path
C
C up[n-1] is loaded unconditionally, so presumably callers guarantee n >= 1
C -- TODO confirm against the callers.
68PROLOGUE(mpn_preinv_divrem_1)
69	.prologue
70	.save	ar.pfs, r42
71	alloc		r42 = ar.pfs, 7, 8, 1, 0	C 7 inputs, 8 locals, 1 output
72	.save	ar.lc, r44
73	mov		r44 = ar.lc
74	.save	rp, r41
75	mov		r41 = b0
76	.body
C For the 32-bit ABI the two pointer arguments go through addp4 and the two
C size arguments are sign-extended with sxt4.
77ifdef(`HAVE_ABI_32',
78`	addp4		r32 = 0, r32
79	sxt4		r33 = r33
80	addp4		r34 = 0, r34
81	sxt4		r35 = r35
82	;;
83')
84	mov		r40 = r38		C r40 = cnt
85	shladd		r34 = r35, 3, r34	C r34 = up + 8*n
86	;;
87	adds		r34 = -8, r34		C r34 = address of up[n-1]
88	;;
89	ld8		r39 = [r34], -8		C r39 = up[n-1], r34 steps down a limb
90	;;
91
92	add		r15 = r35, r33		C r15 = n + qxn
93	;;
94	mov		r8 = r37		C r8 = vlinv
95	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
96	cmp.le		p8, p0 = 0, r36		C p8: vl >= 0 as signed (top bit clear)
97	;;
98	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
99	cmp.leu		p6, p7 = r36, r39	C p6: vl <= up[n-1] unsigned, p7: not
100   (p8)	br.cond.dpnt	.Lpunnorm
101	;;
102
C Divisor has its top bit set.  The top quotient limb is 1 when up[n-1] >= vl
C and 0 otherwise; the starting remainder is up[n-1], reduced by vl in the
C first case.
103   (p6)	addl		r15 = 1, r0
104   (p7)	mov		r15 = r0
105	;;
106   (p6)	sub		r38 = r39, r36
107   (p7)	mov		r38 = r39
108	st8		[r32] = r15, -8
109	adds		r35 = -2, r35		C un -= 2
110	br	.Lpn
111
C Divisor has its top bit clear.  When up[n-1] >= vl (p6, tested before vl is
C shifted) the post-decrement of r34 is undone and .Lpu is entered with r = 0
C and r35 = n.  Otherwise the top quotient limb is stored as zero, up[n-1]
C shifted left by cnt becomes the starting remainder and r35 becomes n - 1.
112.Lpunnorm:
113   (p6)	add		r34 = 8, r34
114	mov		r38 = 0			C r = 0
115	shl		r36 = r36, r40		C vl <<= cnt
116   (p6)	br.cond.dptk	.Lpu
117	;;
118	shl		r38 = r39, r40		C r = ahigh << cnt
119	cmp.ne		p8, p0 = 1, r35		C p8: n != 1
120	st8		[r32] = r0, -8
121	adds		r35 = -1, r35		C un--
122   (p8)	br.cond.dpnt	.Lpu
123
C n == 1: no dividend limbs remain.  Put the inverse in f6 and the constant 1
C in f12, which .Loop4 reads, and continue at .L435.
124	mov		r23 = 1
125	;;
126	setf.sig	f6 = r8
127	setf.sig	f12 = r23
128	br		.L435
129EPILOGUE()
130
131
C mpn_divrem_1 -- divide the n-limb number at up by the single limb vl.
C
C Quotient limbs are stored downwards starting at rp[n+qxn-1]: first the limbs
C developed from the n dividend limbs, then qxn further limbs developed with
C zero as the low dividend limb (.Loop4).  The remainder is returned in r8.
C When n + qxn is zero nothing is stored and zero is returned.
C
C The code at .Lpn, .Lpu and .L435 is also entered by branches from
C mpn_preinv_divrem_1.
C
C Register use in the division loops:
C   r32  address of the next quotient limb to store
C   r34  address of the next dividend limb to load
C   r36  divisor, shifted left by cnt when its top bit was clear
C   r38  current remainder, used as the high dividend limb nh (copy in f7)
C   r40  cnt, the left shift applied to the divisor (0 on the .Lpn path)
C   r8   inverse of the divisor (copy in f6)
C   f10  divisor, f12 constant 1
C   r18  quotient limb being developed
C   r41, r42, r44  saved b0, ar.pfs and ar.lc, restored at .Lret
C   r45  outgoing argument for the calls to mpn_invert_limb
132PROLOGUE(mpn_divrem_1)
133	.prologue
134	.save	ar.pfs, r42
135	alloc		r42 = ar.pfs, 5, 8, 1, 0	C 5 inputs, 8 locals, 1 output
136	.save	ar.lc, r44
137	mov		r44 = ar.lc
138	.save	rp, r41
139	mov		r41 = b0
140	.body
C For the 32-bit ABI the two pointer arguments go through addp4 and the two
C size arguments are sign-extended with sxt4.
141ifdef(`HAVE_ABI_32',
142`	addp4		r32 = 0, r32
143	sxt4		r33 = r33
144	addp4		r34 = 0, r34
145	sxt4		r35 = r35
146	;;
147')
148	mov		r38 = r0		C r = 0
149	add		r15 = r35, r33		C r15 = n + qxn
150	;;
151	cmp.ne		p6, p7 = 0, r15		C p7: n + qxn == 0
152	;;
153   (p7)	mov		r8 = r0			C nothing to do, return 0
154   (p7)	br.cond.dpnt	.Lret
155	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
156	cmp.le		p6, p7 = 0, r36		C p6: vl >= 0 as signed (top bit clear)
157	;;
158	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
159   (p6)	br.cond.dpnt	.Lunnorm
C Divisor has its top bit set.  With n == 0 go straight to the inversion.
C Otherwise load up[n-1], store a top quotient limb of 1 if up[n-1] >= vl and 0
C if not, and start the remainder from up[n-1], reduced by vl in the first
C case.
160	cmp.eq		p6, p7 = 0, r35
161   (p6)	br.cond.dpnt	.L179
162	shladd		r14 = r35, 3, r34
163	;;
164	adds		r14 = -8, r14		C r14 = address of up[n-1]
165	adds		r35 = -1, r35
166	;;
167	ld8		r38 = [r14]
168	;;
169	cmp.leu		p6, p7 = r36, r38	C p6: vl <= up[n-1] unsigned
170	;;
171   (p6)	addl		r15 = 1, r0
172   (p7)	mov		r15 = r0
173	;;
174	st8		[r32] = r15, -8
175  (p6)	sub		r38 = r38, r36
176
C Pass vl to mpn_invert_limb in r45; its result is picked up from r8 at .Lpn.
C r35 is decremented once more and r34 is then set to up + 8*r35.
177.L179:
178	mov		r45 = r36
179	adds		r35 = -1, r35
180	br.call.sptk.many b0 = mpn_invert_limb
181	;;
182	shladd		r34 = r35, 3, r34
C Entry for a divisor with its top bit set; also reached from
C mpn_preinv_divrem_1.  Loads f6 = inverse and f12 = 1 and sets cnt = 0.  With
C r35 negative there are no dividend limbs left for .Loop1 and control goes to
C .L435; otherwise ar.lc = r35.
183.Lpn:
184	mov		r23 = 1
185	;;
186	setf.sig	f6 = r8
187	setf.sig	f12 = r23
188	cmp.le		p6, p7 = 0, r35
189	mov		r40 = 0
190   (p7)	br.cond.dpnt	.L435
191	setf.sig	f10 = r36
192	mov		ar.lc = r35
193	setf.sig	f7 = r38
194	;;
C NOTE(review): r28 is written again inside the loops before any read that is
C visible in this file, so this initial value looks unused -- confirm.
195	sub		r28 = -1, r36
196C Develop quotient limbs for normalized divisor
C Each iteration loads the next dividend limb nl into r20 and forms the
C quotient estimate q = nh + high half of nh * inverse.  It then computes the
C two-limb product q * d (high in r16, low in r15) and subtracts it from nh:nl,
C leaving the low remainder limb in r15 and the high part rH in r16.  Up to
C three correction steps follow, each subtracting d from the remainder and
C adding 1 to q.  Finally q is stored and the remainder becomes the next nh.
C The trailing C nn marks on the stop lines are from the original source;
C they look like cycle counts.
197.Loop1:		C 00				C q=r18 nh=r38/f7
198	ld8		r20 = [r34], -8
199	xma.hu		f11 = f7, f6, f0
200	;;	C 04
201	xma.l		f8 = f11, f12, f7	C q = q + nh
202	;;	C 08
203	getf.sig	r18 = f8
204	xma.hu		f9 = f8, f10, f0
205	xma.l		f8 = f8, f10, f0
206	;;	C 12
207	getf.sig	r16 = f9
208		C 13
209	getf.sig	r15 = f8
210	;;	C 18
C p6 = borrow out of the low limb subtraction, p7 = no borrow.
211	cmp.ltu		p6, p7 = r20, r15
212	sub		r15 = r20, r15
213	sub		r16 = r38, r16
214	;;	C 19
C p8 = rH minus the borrow is nonzero.  The borrow is then folded into r16 and
C p6/p7 are reset to p6 = 0, p7 = 1 (r0 is never different from r0).
215   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
216   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
217   (p6)	add		r16 = -1, r16
218   (p0)	cmp.ne.unc	p6, p7 = r0, r0
219	;;	C 20
C First correction, done only when rH != 0.
220   (p8)	cmp.ltu		p6, p7 = r15, r36
221   (p8)	sub		r15 = r15, r36
222   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
223	;;	C 21
224	.pred.rel "mutex",p6,p7
225   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
226   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
227	cmp.ltu		p6, p7 = r15, r36	C speculative
228	sub		r28 = r15, r36		C speculative, just for cmp
229	;;	C 22
C Second correction, done only when rH is still nonzero.
230   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
231   (p8)	mov		r15 = r28
232   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
233	;;	C 23
C Third correction when the remainder is not below d (p7); f7 receives the
C final remainder on either path.
234   (p6)	setf.sig	f7 = r15
235   (p7)	sub		r15 = r15, r36
236   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
237	;;	C 24
238   (p7)	setf.sig	f7 = r15
239	st8		[r32] = r18, -8
240	mov		r38 = r15
241	br.cloop.dptk	.Loop1
242		C 29/30
243	br.sptk		.L435
244	;;
C Divisor has its top bit clear.  r16 = vl with its bytes reversed, used at
C .L322.  With n == 0, or with up[n-1] >= vl, go to .L322 with r = 0.
C Otherwise up[n-1] becomes the starting remainder and the top quotient limb is
C stored as zero; if that was the only quotient limb (n + qxn == 1) return
C up[n-1] as the remainder, else drop n by one.
245.Lunnorm:
246	mux1		r16 = r36, @rev
247	cmp.eq		p6, p7 = 0, r35
248   (p6)	br.cond.dpnt	.L322
249	shladd		r34 = r35, 3, r34
250	;;
251	adds		r34 = -8, r34		C r34 = address of up[n-1]
252	;;
253	ld8		r39 = [r34]
254	;;
255	cmp.leu		p6, p7 = r36, r39	C p6: vl <= up[n-1] unsigned
256   (p6)	br.cond.dptk	.L322
257	adds		r34 = -8, r34
258	;;
259	mov		r38 = r39
260	;;
261	cmp.ne		p6, p7 = 1, r15		C p7: n + qxn == 1
262	st8		[r32] = r0, -8
263	;;
264   (p7)	mov		r8 = r38
265   (p7)	br.cond.dpnt	.Lret
266	adds		r35 = -1, r35
C Derive the shift count cnt into r40 from the byte-reversed divisor (czx1.l
C on r16 | -r16 picks a byte position, then the steps against 15, 3 and bit 1
C narrow it down to a bit).  Presumably cnt is the number of leading zero bits
C of vl -- TODO confirm.  The divisor and the starting remainder are shifted
C left by cnt, and the shifted divisor is passed to mpn_invert_limb in r45.
267.L322:
268	sub		r14 = r0, r16
269	;;
270	or		r14 = r16, r14
271	;;
272	mov		r16 = -8
273	czx1.l		r14 = r14
274	;;
275	shladd		r16 = r14, 3, r16
276	;;
277	shr.u		r14 = r36, r16
278	;;
279	cmp.geu		p6, p7 = 15, r14
280	;;
281   (p7)	shr.u		r14 = r14, 4
282   (p7)	adds		r16 = 4, r16
283	;;
284	cmp.geu		p6, p7 = 3, r14
285	;;
286   (p7)	shr.u		r14 = r14, 2
287   (p7)	adds		r16 = 2, r16
288	;;
289	tbit.nz		p6, p7 = r14, 1
290	;;
291	.pred.rel "mutex",p6,p7
292  (p6)	sub		r40 = 62, r16
293  (p7)	sub		r40 = 63, r16
294	;;
295	shl		r45 = r36, r40
296	shl		r36 = r36, r40
297	shl		r38 = r38, r40
298	br.call.sptk.many b0 = mpn_invert_limb
299	;;
C Entry for a shifted divisor; also reached from mpn_preinv_divrem_1.  Loads
C f6 = inverse and f12 = 1.  With r35 == 0 there are no dividend limbs and
C control goes to .L435.  Otherwise the first dividend limb is loaded into r39
C and its top cnt bits (r39 >> (64 - cnt)) are merged into the remainder.
C r35 - 2 becomes the loop count; when that is negative only the final limb at
C .Lend3 is developed.
300.Lpu:
301	mov		r23 = 1
302	;;
303	setf.sig	f6 = r8
304	setf.sig	f12 = r23
305	cmp.eq		p6, p7 = 0, r35
306   (p6)	br.cond.dpnt	.L435
307	sub		r16 = 64, r40		C r16 = 64 - cnt
308	adds		r35 = -2, r35
309	;;
310	ld8		r39 = [r34], -8
311	cmp.le		p6, p7 = 0, r35
312	;;
313	shr.u		r14 = r39, r16
314	;;
315	or		r38 = r14, r38
316   (p7)	br.cond.dpnt	.Lend3
317	;;
318	mov		r22 = r16		C r22 = 64 - cnt
319	setf.sig	f10 = r36
320	setf.sig	f7 = r38
321	mov		ar.lc = r35
322	;;
323C Develop quotient limbs for unnormalized divisor
C Same estimate and correction sequence as .Loop1.  The difference is the low
C dividend limb: r20 = (previous limb r39 << cnt) | (new limb r14 >> (64 - cnt)),
C and the newly loaded limb is kept in r39 for the next iteration.
324.Loop3:
325	ld8		r14 = [r34], -8
326	xma.hu		f11 = f7, f6, f0
327	;;
328	xma.l		f8 = f11, f12, f7	C q = q + nh
329	;;
330	getf.sig	r18 = f8
331	xma.hu		f9 = f8, f10, f0
332	shl		r20 = r39, r40
333	xma.l		f8 = f8, f10, f0
334	shr.u		r24 = r14, r22
335	;;
336	getf.sig	r16 = f9
337	getf.sig	r15 = f8
338	or		r20 = r24, r20
339	;;
340	cmp.ltu		p6, p7 = r20, r15
341	sub		r15 = r20, r15
342	sub		r16 = r38, r16
343	;;
344   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
345   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
346   (p6)	add		r16 = -1, r16
347   (p0)	cmp.ne.unc	p6, p7 = r0, r0
348	;;
349   (p8)	cmp.ltu		p6, p7 = r15, r36
350   (p8)	sub		r15 = r15, r36
351   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
352	;;
353	.pred.rel "mutex",p6,p7
354   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
355   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
356	cmp.ltu		p6, p7 = r15, r36	C speculative
357	sub		r28 = r15, r36		C speculative, just for cmp
358	;;
359   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
360   (p8)	mov		r15 = r28
361   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
362	;;
363   (p6)	setf.sig	f7 = r15
364   (p7)	sub		r15 = r15, r36
365   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
366	;;
367   (p7)	setf.sig	f7 = r15
368	st8		[r32] = r18, -8
369	mov		r39 = r14
370	mov		r38 = r15
371	br.cloop.dptk	.Loop3
372	;;
C Last quotient limb developed from the dividend: the low dividend limb is
C just r39 << cnt, with no further limb loaded.  The correction steps are done
C one after another here, without the speculative compare used in the loops.
373.Lend3:
374	setf.sig	f10 = r36
375	setf.sig	f7 = r38
376	;;
377	xma.hu		f11 = f7, f6, f0
378	;;
379	xma.l		f8 = f11, f12, f7	C q = q + nh
380	;;
381	getf.sig	r18 = f8
382	xma.hu		f9 = f8, f10, f0
383	shl		r20 = r39, r40
384	xma.l		f8 = f8, f10, f0
385	;;
386	getf.sig	r16 = f9
387	getf.sig	r15 = f8
388	;;
389	cmp.ltu		p6, p7 = r20, r15
390	sub		r15 = r20, r15
391	sub		r16 = r38, r16
392	;;
393   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
394   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
395   (p6)	add		r16 = -1, r16
396   (p0)	cmp.ne.unc	p6, p7 = r0, r0
397	;;
398   (p8)	cmp.ltu		p6, p7 = r15, r36
399   (p8)	sub		r15 = r15, r36
400   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
401	;;
402	.pred.rel "mutex",p6,p7
403   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
404   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
405	;;
406   (p8)	sub		r15 = r15, r36
407   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
408	;;
409	cmp.ltu		p6, p7 = r15, r36
410	;;
411   (p7)	sub		r15 = r15, r36
412   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
413	;;
414	st8		[r32] = r18, -8
415	mov		r38 = r15
C Develop the qxn quotient limbs that follow the dividend.  Skipped when
C qxn < 1; otherwise ar.lc = qxn - 1.  .Loop4 repeats the .Loop1 sequence with
C zero in place of the loaded dividend limb.
416.L435:
417	adds		r35 = -1, r33
418	cmp.le		p6, p7 = 1, r33
419   (p7)	br.cond.dpnt	.Lend4
420	;;
421	setf.sig	f7 = r38
422	setf.sig	f10 = r36
423	mov		ar.lc = r35
424	;;
425.Loop4:
426	xma.hu		f11 = f7, f6, f0
427	;;
428	xma.l		f8 = f11, f12, f7	C q = q + nh
429	;;
430	getf.sig	r18 = f8
431	xma.hu		f9 = f8, f10, f0
432	xma.l		f8 = f8, f10, f0
433	;;
434	getf.sig	r16 = f9
435	getf.sig	r15 = f8
436	;;
437	cmp.ltu		p6, p7 = 0, r15
438	sub		r15 = 0, r15
439	sub		r16 = r38, r16
440	;;
441   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
442   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
443   (p6)	add		r16 = -1, r16
444   (p0)	cmp.ne.unc	p6, p7 = r0, r0
445	;;
446   (p8)	cmp.ltu		p6, p7 = r15, r36
447   (p8)	sub		r15 = r15, r36
448   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
449	;;
450	.pred.rel "mutex",p6,p7
451   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
452   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
453	cmp.ltu		p6, p7 = r15, r36	C speculative
454	sub		r28 = r15, r36		C speculative, just for cmp
455	;;
456   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
457   (p8)	mov		r15 = r28
458   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
459	;;
460   (p6)	setf.sig	f7 = r15
461   (p7)	sub		r15 = r15, r36
462   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
463	;;
464   (p7)	setf.sig	f7 = r15
465	st8		[r32] = r18, -8
466	mov		r38 = r15
467	br.cloop.dptk	.Loop4
468	;;
C The remainder was developed against the shifted divisor; shift it back
C right by cnt to form the return value.
469.Lend4:
470	shr.u		r8 = r38, r40
C Common exit: restore ar.pfs, ar.lc and b0 from r42, r44 and r41 and return.
471.Lret:
472	mov		ar.pfs = r42
473	mov		ar.lc = r44
474	mov		b0 = r41
475	br.ret.sptk.many b0
476EPILOGUE()
477ASM_END()
478