xref: /inferno-os/libkern/memset-power.s (revision 37da2899f40661e3e9631e497da8dc59b971cbd0)
1*37da2899SCharles.Forsyth	TEXT	memset(SB),$0		/* memset(p, c, n): store the low byte of c into n bytes starting at p */
2*37da2899SCharles.Forsyth#define	BDNZ	BC	16,0,
3*37da2899SCharles.Forsyth	MOVW R3, p+0(FP)		/* R3 is pointer; save it in the frame so it can be reloaded at ret */
4*37da2899SCharles.Forsyth
5*37da2899SCharles.Forsyth/*
6*37da2899SCharles.Forsyth * performance:
7*37da2899SCharles.Forsyth *	about 100mbytes/sec (8k blocks) on a 603/105 without L2 cache
8*37da2899SCharles.Forsyth *	drops to 40mbytes/sec (10k blocks) and 28mbytes/sec with 32k blocks
9*37da2899SCharles.Forsyth */
10*37da2899SCharles.Forsyth
11*37da2899SCharles.Forsyth	MOVW	n+8(FP), R4		/* R4 is count */
12*37da2899SCharles.Forsyth	CMP	R4, $0
13*37da2899SCharles.Forsyth	BLE	ret			/* count <= 0: store nothing. NOTE(review): the test is signed, so a count with the top bit set also lands here -- confirm that is intended */
14*37da2899SCharles.Forsyth	MOVW	c+4(FP), R5		/* R5 is char */
15*37da2899SCharles.Forsyth
16*37da2899SCharles.Forsyth/*
17*37da2899SCharles.Forsyth * create 16 copies of c in R5 .. R8: build 4 copies in R5, then duplicate R5 into R6, R7 and R8
18*37da2899SCharles.Forsyth */
19*37da2899SCharles.Forsyth	RLWNM	$0, R5, $0xff, R5		/* keep only the low byte of c */
20*37da2899SCharles.Forsyth	RLWMI	$8, R5, $0xff00, R5		/* rotate left 8 and insert under mask 0xff00: low halfword is now c,c */
21*37da2899SCharles.Forsyth	RLWMI	$16, R5, $0xffff0000, R5	/* rotate left 16 and insert under mask 0xffff0000: all four bytes are c */
22*37da2899SCharles.Forsyth	MOVW	R5, R6
23*37da2899SCharles.Forsyth	MOVW	R5, R7
24*37da2899SCharles.Forsyth	MOVW	R5, R8
25*37da2899SCharles.Forsyth
26*37da2899SCharles.Forsyth/*
27*37da2899SCharles.Forsyth * let STSW do the work for 16 characters or less; aligned and unaligned
28*37da2899SCharles.Forsyth */
29*37da2899SCharles.Forsyth	CMP	R4, $16
30*37da2899SCharles.Forsyth	BLE	out			/* count <= 16: one string store at out covers everything */
31*37da2899SCharles.Forsyth
32*37da2899SCharles.Forsyth/*
33*37da2899SCharles.Forsyth * store enough bytes to align pointer to a multiple of 8 (count is > 16 here, so it stays positive afterwards)
34*37da2899SCharles.Forsyth */
35*37da2899SCharles.Forsyth	ANDCC	$7,R3, R9		/* R9 = pointer & 7; also sets the condition codes for BEQ */
36*37da2899SCharles.Forsyth	BEQ	l2			/* already aligned: skip the fix-up */
37*37da2899SCharles.Forsyth	SUBC	R9, $8, R9		/* R9 = 8 - (pointer & 7): bytes up to the next 8-byte boundary */
38*37da2899SCharles.Forsyth	MOVW	R9, XER			/* XER holds the byte count used by STSW */
39*37da2899SCharles.Forsyth	STSW	R5, (R3)		/* store R9 bytes, taken from R5 onwards */
40*37da2899SCharles.Forsyth	ADD	R9, R3			/* advance pointer past the bytes just stored */
41*37da2899SCharles.Forsyth	SUB	R9, R4			/* and take them off the count */
42*37da2899SCharles.Forsyth
43*37da2899SCharles.Forsyth/*
44*37da2899SCharles.Forsyth * store 16 at a time while there's room
45*37da2899SCharles.Forsyth * STSW was used here originally, but it's `completion serialised'
46*37da2899SCharles.Forsyth */
47*37da2899SCharles.Forsythl2:
48*37da2899SCharles.Forsyth	SRAWCC	$4, R4, R9		/* R9 = count >> 4: number of whole 16-byte chunks; sets condition codes */
49*37da2899SCharles.Forsyth	BLE	out			/* no whole chunk left: store the remaining count bytes at out */
50*37da2899SCharles.Forsyth	MOVW	R9, CTR			/* chunk count becomes the loop counter for BDNZ */
51*37da2899SCharles.Forsythl3:
52*37da2899SCharles.Forsyth	MOVW	R5, 0(R3)		/* bytes 0..3 of this chunk */
53*37da2899SCharles.Forsyth	ADD	$8, R3, R10		/* R10 = R3 + 8, address of the second half of the chunk */
54*37da2899SCharles.Forsyth	MOVW	R6, 4(R3)		/* bytes 4..7 */
55*37da2899SCharles.Forsyth	MOVW	R7, 0(R10)		/* bytes 8..11 */
56*37da2899SCharles.Forsyth	ADD	$8, R10, R3		/* R3 = R10 + 8: pointer has advanced by 16 */
57*37da2899SCharles.Forsyth	MOVW	R8, 4(R10)		/* bytes 12..15 */
58*37da2899SCharles.Forsyth	BDNZ	l3			/* decrement CTR and loop while it is non-zero */
59*37da2899SCharles.Forsyth	RLWNMCC	$0, R4, $15, R4	/* residue: R4 = count & 15; sets condition codes */
60*37da2899SCharles.Forsyth	BEQ	ret			/* count was a multiple of 16: nothing left to store */
61*37da2899SCharles.Forsyth
62*37da2899SCharles.Forsyth/*
63*37da2899SCharles.Forsyth * store up to 16 bytes from R5 .. R8; aligned and unaligned (byte count is in R4)
64*37da2899SCharles.Forsyth */
65*37da2899SCharles.Forsyth
66*37da2899SCharles.Forsythout:
67*37da2899SCharles.Forsyth	MOVW	R4, XER			/* XER holds the byte count used by STSW */
68*37da2899SCharles.Forsyth	STSW	R5, (R3)
69*37da2899SCharles.Forsyth
70*37da2899SCharles.Forsythret:
71*37da2899SCharles.Forsyth	MOVW	0(FP), R3		/* reload the pointer saved at entry; presumably the return value -- confirm against the calling convention */
72*37da2899SCharles.Forsyth	RETURN
73*37da2899SCharles.Forsyth	END
74