xref: /netbsd-src/external/gpl3/binutils/dist/ld/emultempl/spu_ovl.S (revision 4fee23f98c45552038ad6b5bd05124a41302fb01)
1/* Overlay manager for SPU.
2
3   Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
4
5   This file is part of the GNU Binutils.
6
7   This program is free software; you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 3 of the License, or
10   (at your option) any later version.
11
12   This program is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with this program; if not, write to the Free Software
19   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
20   MA 02110-1301, USA.  */
21
22/* MFC DMA definitions.  */
/* Command value written to $MFC_Cmd to start each overlay transfer.  */
23#define MFC_GET_CMD		0x40
/* Upper bound on the size of a single transfer; __ovly_xfer_loop
   splits larger overlays into pieces of at most this many bytes.  */
24#define MFC_MAX_DMA_SIZE	0x4000
/* Value written to $MFC_WrTagUpdate before $MFC_RdTagStat is read.  */
25#define MFC_TAG_UPDATE_ALL	2
/* Tag written to $MFC_TagId for every transfer.  The same number
   selects the bit placed in the tag mask while waiting for the
   transfers to complete.  */
26#define MFC_TAG_ID		0
27
28/* Register usage.

   Every name below is an alias for one of nine physical registers,
   $71-$79; each physical register is reused for several short-lived
   values.  Two names that expand to the same register therefore
   overwrite each other.

   reserved1-reserved5 ($75-$79) are written by this file without
   being saved first.  save1-save4 ($74-$71) are stored below $sp
   (at -16, -32, -48 and -64) before they are modified and reloaded
   from those slots afterwards.  */

/* $75: also the incoming parameter register of __ovly_load when
   OVL_STUB_SIZE == 8.  */
29#define reserved1	$75
30#define parm		$75
31#define tab1		reserved1
32#define tab2		reserved1
33#define vma		reserved1
34#define oldvma		reserved1
35#define newmask		reserved1
36#define map		reserved1
37
/* $76.  */
38#define reserved2	$76
39#define off1		reserved2
40#define off2		reserved2
41#define present1	reserved2
42#define present2	reserved2
43#define sz		reserved2
44#define cmp		reserved2
45#define add64		reserved2
46#define cgbits		reserved2
47#define off3		reserved2
48#define off4		reserved2
49#define addr4		reserved2
50#define off5		reserved2
51#define tagstat		reserved2
52
/* $77.  */
53#define reserved3	$77
54#define size1		reserved3
55#define size2		reserved3
56#define rv3		reserved3
57#define ealo		reserved3
58#define cmd		reserved3
59#define off64		reserved3
60#define tab3		reserved3
61#define tab4		reserved3
62#define tab5		reserved3
63
/* $78: holds the overlay index in __ovly_return and __ovly_load.  */
64#define reserved4	$78
65#define ovl		reserved4
66#define rv2		reserved4
67#define rv5		reserved4
68#define cgshuf		reserved4
69#define newovl		reserved4
70#define irqtmp1		reserved4
71#define irqtmp2		reserved4
72
/* $79: the address finally branched to; never reused for anything
   else in this file.  */
73#define reserved5	$79
74#define target		reserved5
75
/* $74: spilled to -16($sp).  */
76#define save1		$74
77#define rv4		save1
78#define rv7		save1
79#define tagid		save1
80#define maxsize		save1
81#define pbyte		save1
82#define pbit		save1
83
/* $73: spilled to -32($sp).  */
84#define save2		$73
85#define cur		save2
86#define rv6		save2
87#define osize		save2
88#define zovl		save2
89#define oldovl		save2
90#define newvma		save2
91
/* $72: spilled to -48($sp).  */
92#define save3		$72
93#define rv1		save3
94#define ea64		save3
95#define buf3		save3
96#define genwi		save3
97#define newmap		save3
98#define oldmask		save3
99
/* $71: spilled to -64($sp); only referenced when OVLY_IRQ_SAVE is
   defined.  */
100#define save4		$71
101#define irq_stat	save4
102
/* Constant and state quadwords used by the overlay manager.  They are
   placed in .text alongside the code and are each 16 bytes.  */
103	.text
104	.align 	4
/* shufb control used by __ovly_load to start building a new $lr:
   selector bytes 0x00-0x03 copy word 0 of the first source,
   0x10-0x13 copy word 0 of the second source, and 0x80 yields a zero
   byte.  With sources (__ovly_return address, __ovly_current) the
   result is { __ovly_return, current overlay index, 0, 0 }.  */
105	.type	__rv_pattern, @object
106	.size	__rv_pattern, 16
107__rv_pattern:
108	.word	0x00010203, 0x10111213, 0x80808080, 0x80808080
109
/* shufb control used by __ovly_xfer_loop: copies word 1 of the source
   into word 0 and zeroes everything else.  It moves the carry out of
   the low word of a 64-bit add to where addx consumes it for the
   high word.  */
110	.type	__cg_pattern, @object
111	.size	__cg_pattern, 16
112__cg_pattern:
113	.word	0x04050607, 0x80808080, 0x80808080, 0x80808080
114
/* Index of the overlay most recently requested, kept in word 0 of a
   quadword.  Written by __ovly_return and __ovly_load, read back by
   __ovly_load and by the table update after do_load.  */
115	.type	__ovly_current, @object
116	.size	__ovly_current, 16
117__ovly_current:
118	.space	16
119
120/*
121 * __ovly_return - stub for returning from overlay functions.
122 *
123 * On entry the four slots of $lr are:
124 *   __ovly_return, prev ovl index, caller return addr, undefined.
125 *
126 * Load the previous overlay and jump to the caller return address.
127 * Updates __ovly_current.
128 */
/* Registers written here: $75 (tab1/vma), $76 (off1/present1),
   $77 (size1), $78 (ovl) and $79 (target).  save1-save3, and save4
   when OVLY_IRQ_SAVE is defined, are only stored to the stack, not
   modified; do_load modifies them and reloads them from the same
   slots, so the already-present path needs no reload.

   NOTE(review): the trailing "# p,l  c" annotations appear to record
   pipeline, latency and issue cycle, and the commented-out nop/lnop
   lines the empty issue slots of the hand schedule -- confirm before
   relying on them.  */
129	.align 	4
130	.global	__ovly_return
131	.type	__ovly_return, @function
132__ovly_return:
/* The table base is biased by one 16-byte entry, so index N (scaled
   by 16 below) addresses _ovly_table[N - 1].  */
133	ila	tab1, _ovly_table - 16				# 0,2	0
/* Shift $lr left by 4 bytes: previous overlay index into word 0.  */
134	shlqbyi	ovl, $lr, 4					# 1,4	0
135#nop
/* Shift $lr left by 8 bytes: caller return address into word 0.  */
136	shlqbyi	target, $lr, 8					# 1,4	1
137#nop; lnop
138#nop; lnop
139	shli	off1, ovl, 4					# 0,4	4
140#lnop
141#nop
/* Hint the indirect branch at ovly_ret9.  */
142	hbr	ovly_ret9, target				# 1,15	5
143#nop; lnop
144#nop; lnop
145#nop
/* Fetch the _ovly_table entry for the overlay being returned to.  */
146	lqx	vma, tab1, off1					# 1,6	8
147#ifdef OVLY_IRQ_SAVE
148	nop
149	stqd	save4, -64($sp)					# 1,6	9
150#else
151#nop; lnop
152#endif
153#nop; lnop
154#nop; lnop
155#nop; lnop
156#nop; lnop
157#nop
/* Rotate the entry left by 4 bytes: its .size word into word 0.  */
158	rotqbyi	size1, vma, 4					# 1,4	14
159#nop
160	stqd	save3, -48($sp)					# 1,6	15
161#nop
162	stqd	save2, -32($sp)					# 1,6	16
163#nop
164	stqd	save1, -16($sp)					# 1,6	17
/* The low bit of .size is the "overlay is present" flag.  */
165	andi	present1, size1, 1				# 0,2	18
166	stqr	ovl, __ovly_current				# 1,6	18
167#nop; lnop
168#nop
/* Not present: go and load it; do_load finishes with bi target.  */
169	brz	present1, do_load				# 1,4	20
170ovly_ret9:
171#nop
172	bi	target						# 1,4	21
173
174/*
175 * __ovly_load - copy an overlay partition to local store.
176 *
177 * On entry $75 points to a word consisting of the overlay index in
178 * the top 14 bits, and the target address in the bottom 18 bits.
179 *
180 * Sets up $lr to return via __ovly_return.  If $lr is already set
181 * to return via __ovly_return, don't change it.  In that case we
182 * have a tail call from one overlay function to another.
183 * Updates __ovly_current.
184 */
/* The entry convention above is what the OVL_STUB_SIZE == 8 variant
   implements.  The other variant reads ovl ($78) and target ($79)
   without setting them first.
   NOTE(review): presumably the 16-byte stub loads both registers
   itself -- confirm against the stub generator.

   save1-save3 are spilled, used as scratch (rv1, cur/rv6, rv4/rv7)
   and reloaded before either exit.  */
185	.align  3
186	.global	__ovly_load
187	.type	__ovly_load, @function
188__ovly_load:
189#if OVL_STUB_SIZE == 8
190########
191#nop
/* Load the quadword containing the parameter word, then rotate by
   the pointer so that word lands in word 0.  */
192	lqd	target, 0(parm)					# 1,6	-11
193#nop; lnop
194#nop; lnop
195#nop; lnop
196#nop; lnop
197#nop; lnop
198#nop
199	rotqby	target, target, parm				# 1,4	-5
/* Table base biased by one 16-byte entry: index N addresses
   _ovly_table[N - 1].  */
200	ila	tab2, _ovly_table - 16				# 0,2	-4
201	stqd	save3, -48($sp)					# 1,6	-4
202#nop
203	stqd	save2, -32($sp)					# 1,6	-3
204#nop
205	stqd	save1, -16($sp)					# 1,6	-2
/* Logical shift right by 18: the top 14 bits become the overlay
   index.  target itself keeps the whole word.
   NOTE(review): presumably the branch ignores the upper bits of
   target -- confirm against the ISA description of bi.  */
206	rotmi	ovl, target, -18				# 0,4	-1
207	hbr	ovly_load9, target				# 1,15	-1
208	ila	rv1, __ovly_return				# 0,2	0
209#lnop
210#nop; lnop
211#nop
/* Read the old current overlay before overwriting it; it becomes
   the "prev ovl index" slot of the new $lr.  */
212	lqr	cur, __ovly_current				# 1,6	2
213	shli	off2, ovl, 4					# 0,4	3
214	stqr	ovl, __ovly_current				# 1,6	3
/* Word 0 of rv2 is all ones when $lr already points at
   __ovly_return.  */
215	ceq	rv2, $lr, rv1					# 0,2	4
216	lqr	rv3, __rv_pattern				# 1,6	4
217#nop; lnop
218#nop; lnop
219#nop
220	lqx	vma, tab2, off2					# 1,6	7
221########
222#else /* OVL_STUB_SIZE == 16 */
223########
/* Same work as the variant above, minus decoding the parameter
   word, in a different schedule.  */
224	ila	tab2, _ovly_table - 16				# 0,2	0
225	stqd	save3, -48($sp)					# 1,6	0
226	ila	rv1, __ovly_return				# 0,2	1
227	stqd	save2, -32($sp)					# 1,6	1
228	shli	off2, ovl, 4					# 0,4	2
229	lqr	cur, __ovly_current				# 1,6	2
230	nop
231	stqr	ovl, __ovly_current				# 1,6	3
232	ceq	rv2, $lr, rv1					# 0,2	4
233	lqr	rv3, __rv_pattern				# 1,6	4
234#nop
235	hbr	ovly_load9, target				# 1,15	5
236#nop
237	lqx	vma, tab2, off2					# 1,6	6
238#nop
239	stqd	save1, -16($sp)					# 1,6	7
240########
241#endif
242
243#nop; lnop
244#nop; lnop
245#nop
/* rv4 = { __ovly_return, old current overlay index, 0, 0 }.  */
246	shufb	rv4, rv1, cur, rv3				# 1,4	10
247#nop
/* Expand the comparison result into a select mask: all ones when
   $lr already returns via __ovly_return, otherwise all zeros.  */
248	fsmb	rv5, rv2					# 1,4	11
249#nop
/* Shift $lr right by 8 bytes: the original return address moves
   from word 0 to word 2, with zeros shifted in above it.  */
250	rotqmbyi rv6, $lr, -8					# 1,4	12
251#nop
/* .size word of the table entry into word 0.  */
252	rotqbyi	size2, vma, 4					# 1,4	13
253#nop
254	lqd	save3, -48($sp)					# 1,6	14
255#nop; lnop
/* rv7 = { __ovly_return, old index, original return address, ... },
   the layout __ovly_return expects in $lr.  */
256	or	rv7, rv4, rv6					# 0,2	16
257	lqd	save2, -32($sp)					# 1,6	16
/* Low bit of .size is the "overlay is present" flag.  */
258	andi	present2, size2, 1				# 0,2	17
259#ifdef OVLY_IRQ_SAVE
260	stqd	save4, -64($sp)					# 1,6	17
261#else
262	lnop							# 1,0	17
263#endif
/* Keep $lr where the mask is set (tail call), otherwise install
   the newly built value.  */
264	selb	$lr, rv7, $lr, rv5				# 0,2	18
265	lqd	save1, -16($sp)					# 1,6	18
266#nop
/* Not present: go and load it; do_load finishes with bi target.  */
267	brz	present2, do_load				# 1,4	19
268ovly_load9:
269#nop
270	bi	target						# 1,4	20
271
272/* If we get here, we are about to load a new overlay.
273 * "vma" contains the relevant entry from _ovly_table[].
274 *	extern struct {
275 *		u32 vma;
276 *		u32 size;
277 *		u32 file_offset;
278 *		u32 buf;
279 *	} _ovly_table[];
280 */
/* Also required on entry: target holds the address to branch to
   when loading is done, __ovly_current holds the index of the
   overlay being loaded, and save1-save3 (plus save4 under
   OVLY_IRQ_SAVE) have been stored to their stack slots.  Both
   callers in this file arrange this.

   This part sets up the transfer loop: sz word 0 = file_offset,
   osize word 0 = size, ea64 = the quadword at _EAR_, cgshuf = the
   carry-moving shuffle pattern.  */
281	.align  3
282	.global	__ovly_load_event
283	.type	__ovly_load_event, @function
284__ovly_load_event:
285do_load:
286#ifdef OVLY_IRQ_SAVE
/* Record the machine status, then branch to the very next
   instruction with bid so that interrupts are disabled from here
   on.  The saved status decides later whether to re-enable them.  */
287	ila	irqtmp1, do_load10				# 0,2	-5
288	rotqbyi	sz, vma, 8					# 1,4	-5
289#nop
290	rdch	irq_stat, $SPU_RdMachStat			# 1,6	-4
291#nop
292	bid	irqtmp1						# 1,4	-3
293do_load10:
294	nop
295#else
296#nop
/* Rotate the entry left by 8 bytes: file_offset into word 0.  */
297	rotqbyi	sz, vma, 8					# 1,4	0
298#endif
/* Rotate the entry left by 4 bytes: size into word 0.  The present
   bit is known to be clear on this path.  */
299	rotqbyi	osize, vma, 4					# 1,4	1
300#nop
/* _EAR_ is defined outside this file; words 0 and 1 are used below
   as the high and low halves of a 64-bit effective address.  */
301	lqa	ea64, _EAR_					# 1,6	2
302#nop
303	lqr	cgshuf, __cg_pattern				# 1,6	3
304
305/* We could predict the branch at the end of this loop by adding a few
306   instructions, and there are plenty of free cycles to do so without
307   impacting loop execution time.  However, it doesn't make a great
308   deal of sense since we need to wait for the dma to complete anyway.  */
/* Each pass queues one transfer of min(osize, MFC_MAX_DMA_SIZE)
   bytes and advances the addresses.

   On entry to a pass, word 0 of sz is the amount to add to the
   64-bit effective address: file_offset on the first pass, the size
   of the previous piece afterwards.  vma word 0 is the local store
   destination and osize word 0 the number of bytes still to queue.
   The loop only queues commands; it does not wait for them.  */
309__ovly_xfer_loop:
310#nop
/* Shift right by 4 bytes so the increment sits in word 1 with a
   zero word 0, i.e. as a 64-bit value in words 0-1.  */
311	rotqmbyi off64, sz, -4					# 1,4	4
312#nop; lnop
313#nop; lnop
314#nop; lnop
/* 64-bit add built from 32-bit pieces: per-word carry out ...  */
315	cg	cgbits, ea64, off64				# 0,2	8
316#lnop
317#nop; lnop
318#nop
/* ... carry from the low word moved into the high word's slot ...  */
319	shufb	add64, cgbits, cgbits, cgshuf			# 1,4	10
320#nop; lnop
321#nop; lnop
322#nop; lnop
/* ... and consumed as carry-in by the extended add.  */
323	addx	add64, ea64, off64				# 0,2	14
324#lnop
325	ila	maxsize, MFC_MAX_DMA_SIZE			# 0,2	15
326	lnop
/* Keep the updated address in ea64 for the next pass; ealo gets the
   low half rotated into word 0 for the channel write.  */
327	ori	ea64, add64, 0					# 0,2	16
328	rotqbyi	ealo, add64, 4					# 1,4	16
329	cgt	cmp, osize, maxsize				# 0,2	17
330	wrch	$MFC_LSA, vma					# 1,6	17
331#nop; lnop
/* sz = min(osize, maxsize).  */
332	selb	sz, osize, maxsize, cmp				# 0,2	19
333	wrch	$MFC_EAH, ea64					# 1,6	19
334	ila	tagid, MFC_TAG_ID				# 0,2	20
335	wrch	$MFC_EAL, ealo					# 1,6	20
336	ila	cmd, MFC_GET_CMD				# 0,2	21
337	wrch	$MFC_Size, sz					# 1,6	21
/* Bytes still to queue, and the next local store destination.  */
338	sf	osize, sz, osize				# 0,2	22
339	wrch	$MFC_TagId, tagid				# 1,6	22
340	a	vma, vma, sz					# 0,2	23
/* Writing the command channel is the last of the parameter writes
   for this piece.  */
341	wrch	$MFC_Cmd, cmd					# 1,6	23
342#nop
343	brnz	osize, __ovly_xfer_loop				# 1,4	24
344
345/* Now update our data structures while waiting for DMA to complete.
346   Low bit of .size needs to be cleared on the _ovly_table entry
347   corresponding to the evicted overlay, and set on the entry for the
348   newly loaded overlay.  Note that no overlay may in fact be evicted
349   as _ovly_buf_table[] starts with all zeros.  Don't zap .size entry
350   for zero index!  Also of course update the _ovly_buf_table entry.  */
/* This part marks the new overlay present and records it in
   _ovly_buf_table, leaving the index of the overlay that previously
   occupied the buffer in word 0 of oldovl (0 if none).  Clearing
   the evicted overlay's bit happens after the DMA wait.  */
351#nop
352	lqr	newovl, __ovly_current				# 1,6	25
353#nop; lnop
354#nop; lnop
355#nop; lnop
356#nop; lnop
357#nop; lnop
358	shli	off3, newovl, 4					# 0,4	31
359#lnop
360	ila	tab3, _ovly_table - 16				# 0,2	32
361#lnop
362#nop
/* fsmbi 0x100 sets byte 7 to 0xff; after the andi below that leaves
   the value 1 in word 1 only, lined up with the .size field.  */
363	fsmbi	pbyte, 0x100					# 1,4	33
364#nop; lnop
365#nop
/* Reload the table entry: the copy in vma was advanced by the
   transfer loop.  */
366	lqx	vma, tab3, off3					# 1,6	35
367#nop; lnop
368	andi	pbit, pbyte, 1					# 0,2	37
369	lnop
370#nop; lnop
371#nop; lnop
372#nop; lnop
/* Set the present bit of the new overlay and write the entry back.  */
373	or	newvma, vma, pbit				# 0,2	41
/* Rotate left by 12 bytes: the .buf field into word 0.  */
374	rotqbyi	buf3, vma, 12					# 1,4	41
375#nop; lnop
376#nop
377	stqx	newvma, tab3, off3				# 1,6	43
378#nop; lnop
/* _ovly_buf_table is indexed by buf with 4-byte entries; the base
   is biased by one entry, so buf N addresses _ovly_buf_table[N - 1].  */
379	shli	off4, buf3, 2					# 1,4	45
380#lnop
381	ila	tab4, _ovly_buf_table - 4			# 0,2	46
382#lnop
383#nop; lnop
384#nop; lnop
385#nop
/* Read-modify-write of the quadword holding the 4-byte entry.  */
386	lqx	map, tab4, off4					# 1,6	49
387#nop
/* Shuffle control that inserts a word at the entry's position.  */
388	cwx	genwi, tab4, off4				# 1,4	50
389	a	addr4, tab4, off4				# 0,2	51
390#lnop
391#nop; lnop
392#nop; lnop
393#nop; lnop
394#nop
/* Rotate the old entry into word 0: index of the overlay that was
   in this buffer.  */
395	rotqby	oldovl, map, addr4				# 1,4	55
396#nop
/* Replace that entry with the new overlay index.  */
397	shufb	newmap, newovl, map, genwi			# 0,4	56
/* Tag mask with only the bit for MFC_TAG_ID set; it is used after
   this block when waiting for the transfers.  */
398#if MFC_TAG_ID < 16
399	ila	newmask, 1 << MFC_TAG_ID			# 0,2	57
400#else
401	ilhu	newmask, 1 << (MFC_TAG_ID - 16)			# 0,2	57
402#endif
403#lnop
404#nop; lnop
405#nop; lnop
406	stqd	newmap, 0(addr4)				# 1,6	60
407
408/* Save app's tagmask, wait for DMA complete, restore mask.  */
/* Inputs from the code above: newmask holds the tag mask bit for
   MFC_TAG_ID, oldovl word 0 the index of the evicted overlay (0 if
   none), pbit the value 1 in the .size word position, and target the
   address to branch to.  */
409	ila	tagstat, MFC_TAG_UPDATE_ALL			# 0,2	61
410	rdch	oldmask, $MFC_RdTagMask				# 1,6	61
411#nop
412	wrch	$MFC_WrTagMask, newmask				# 1,6	62
413#nop
414	wrch	$MFC_WrTagUpdate, tagstat			# 1,6	63
415#nop
/* The value read is not used; this read is the wait itself.
   NOTE(review): presumably the channel read stalls until the
   requested tag update condition holds -- confirm against the MFC
   channel documentation.  */
416	rdch	tagstat, $MFC_RdTagStat				# 1,6	64
417#nop
418	sync							# 1,4	65
419/* Any hint prior to the sync is lost.  A hint here allows the branch
420   to complete 15 cycles after the hint.  With no hint the branch will
421   take 18 or 19 cycles.  */
422	ila	tab5, _ovly_table - 16				# 0,2	66
423	hbr	do_load99, target				# 1,15	66
424	shli	off5, oldovl, 4					# 0,4	67
425	wrch	$MFC_WrTagMask, oldmask				# 1,6	67
/* zovl becomes all ones in every word when no overlay was evicted
   (oldovl == 0), all zeros otherwise.  */
426	ceqi	zovl, oldovl, 0					# 0,2	68
427#lnop
428#nop; lnop
429#nop
430	fsm	zovl, zovl					# 1,4	70
431#nop
/* With oldovl == 0 this reads, and later rewrites unchanged, the
   quadword just below _ovly_table.  */
432	lqx	oldvma, tab5, off5				# 1,6	71
433#nop
434	lqd	save3, -48($sp)					# 1,6	72
435#nop; lnop
/* Drop the bit to clear when nothing was evicted.  */
436	andc	pbit, pbit, zovl				# 0,2	74
437	lqd	save2, -32($sp)					# 1,6	74
438#ifdef OVLY_IRQ_SAVE
439	ila	irqtmp2, do_load90				# 0,2	75
440#lnop
/* Keep only bit 0 of the machine status saved on entry.  */
441	andi	irq_stat, irq_stat, 1				# 0,2	76
442#lnop
443#else
444#nop; lnop
445#nop; lnop
446#endif
/* Clear the present bit of the evicted overlay's .size.  */
447	andc	oldvma, oldvma, pbit				# 0,2	77
448	lqd	save1, -16($sp)					# 1,6	77
449	nop	       						# 0,0	78
450#lnop
451#nop
452	stqx	oldvma, tab5, off5				# 1,6	79
453#nop
454#ifdef OVLY_IRQ_SAVE
/* If the saved status bit was set, branch to do_load90 with
   interrupts re-enabled; otherwise fall through to the same place
   leaving them disabled.  */
455	binze	irq_stat, irqtmp2				# 1,4	80
456do_load90:
457#nop
458	lqd	save4, -64($sp)					# 1,6	84
459#else
460#nop; lnop
461#endif
462
/* Global label on a nop immediately before the final branch.
   NOTE(review): presumably a hook for a debugger to break on once an
   overlay has been loaded -- confirm.  */
463	.global	_ovly_debug_event
464	.type	_ovly_debug_event, @function
465_ovly_debug_event:
466	nop
467/* Branch to target address. */
468do_load99:
469	bi	target						# 1,4	81/85
470
/* The size of __ovly_load is measured to this point, so it also
   covers the do_load code above.  */
471	.size	__ovly_load, . - __ovly_load
472