xref: /plan9/sys/src/ape/lib/ap/power/memset.s (revision 7dd7cddf99dd7472612f1413b4da293630e6b1bc)
/*
 * void *memset(void *p, int c, long n) — fill n bytes at p with byte c.
 * Plan 9 PowerPC assembler; args at p+0(FP), c+4(FP), n+8(FP).
 * Returns the original pointer p in R3 (C memset convention).
 * Register roles: R3 = running dest pointer, R4 = bytes remaining,
 * R5..R8 = fill word (c replicated into all 4 bytes), R9/R10 = scratch.
 */
1	TEXT	memset(SB),$0
/* BDNZ target expands to BC 16,0,target: decrement CTR, branch if CTR != 0 */
2#define	BDNZ	BC	16,0,
3	MOVW R3, p+0(FP)		/* R3 is pointer; spilled so `ret' can reload the original p */
4
5/*
6 * performance:
7 *	about 100mbytes/sec (8k blocks) on a 603/105 without L2 cache
8 *	drops to 40mbytes/sec (10k blocks) and 28mbytes/sec with 32k blocks
9 */
10
11	MOVW	n+8(FP), R4		/* R4 is count */
12	CMP	R4, $0
13	BLE	ret			/* nothing to do for n <= 0 */
14	MOVW	c+4(FP), R5		/* R5 is char */
15
16/*
17 * create 16 copies of c in R5 .. R8
18 * (one copy per byte: 4 bytes in each of the 4 registers)
19 */
19	RLWNM	$0, R5, $0xff, R5	/* keep only the low byte of c */
20	RLWMI	$8, R5, $0xff00, R5	/* insert copy into bits 8..15 */
21	RLWMI	$16, R5, $0xffff0000, R5	/* insert low halfword into high halfword: R5 = c repeated 4x */
22	MOVW	R5, R6
23	MOVW	R5, R7
24	MOVW	R5, R8
25
26/*
27 * let STSW do the work for 16 characters or less; aligned and unaligned
28 * (STSW stores the byte count given in the low bits of XER)
29 */
29	CMP	R4, $16
30	BLE	out
31
32/*
33 * store enough bytes to align pointer
34 */
35	ANDCC	$7,R3, R9		/* R9 = p & 7; CC set for the BEQ below */
36	BEQ	l2			/* already 8-byte aligned */
37	SUBC	R9, $8, R9		/* R9 = 8 - (p & 7) = bytes up to the boundary */
38	MOVW	R9, XER			/* STSW byte count lives in XER */
39	STSW	R5, (R3)
40	ADD	R9, R3			/* advance pointer past the fill */
41	SUB	R9, R4			/* and shrink the remaining count */
42
43/*
44 * store 16 at a time while there's room
45 * STSW was used here originally, but it's `completion serialised'
46 */
47l2:
48	SRAWCC	$4, R4, R9		/* R9 = count/16 = number of 16-byte groups; CC for BLE */
49	BLE	out
50	MOVW	R9, CTR			/* loop trip count for BDNZ */
51l3:
52	MOVW	R5, 0(R3)
53	ADD	$8, R3, R10		/* R10 = p + 8: second word pair */
54	MOVW	R6, 4(R3)
55	MOVW	R7, 0(R10)
56	ADD	$8, R10, R3		/* p += 16 for the next iteration */
57	MOVW	R8, 4(R10)
58	BDNZ	l3
59	RLWNMCC	$0, R4, $15, R4	/* residue: R4 = count & 15; CC for BEQ */
60	BEQ	ret
61
62/*
63 * store up to 16 bytes from R5 .. R8; aligned and unaligned
64 */
65
66out:
67	MOVW	R4, XER			/* byte count for the string store */
68	STSW	R5, (R3)
69
70ret:
71	MOVW	0(FP), R3		/* reload and return the original pointer p */
72	RETURN
73	END
74
74