/* xref: /plan9/sys/src/libc/arm/memmove.s (revision 6ca8a7e31df69aa273b57c9a8727a3aa1ead4d4b) */
/*
 * Register numbers, referenced below as R(name).
 */
TS = 0					/* destination ("to") start; advanced by the ascending copies */
TE = 1					/* destination end (TS + n); walked down by the descending copies */
FROM = 2				/* source pointer */
N = 3					/* byte count; last read by the CMP $4 size checks */
TMP = 3					/* N and TMP don't overlap */
TMP1 = 4				/* data register for the one-word-at-a-time loops */

/*
 * memcpy(to, from, n) and memmove(to, from, n) share one body:
 * memcpy simply branches to _memmove, so both pick a copy direction
 * from the relative position of the two pointers.
 *
 * R(TS) is treated as already holding `to` on entry (it is stored to
 * the frame before anything loads it); from and n are read from the
 * frame.  The saved `to` is reloaded into R0 at _return.
 */
TEXT memcpy(SB), $0
	B	_memmove
TEXT memmove(SB), $0
_memmove:
	MOVW	R(TS), to+0(FP)		/* need to save for return value */
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)	/* to end pointer */

	CMP	R(FROM), R(TS)
	BLS	_forward		/* to <= from (unsigned): copy ascending; otherwise fall into _back */

/*
 * Descending copy, taken when to > from.  FROM and TE both start one
 * past the end of their buffers and every access pre-decrements, so
 * the loops finish when TE has come back down to TS.
 */
_back:
	ADD	R(N), R(FROM)		/* from end pointer */
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, R(TE), R(TMP)	/* TMP is N's register; N is not read again after this */
	BEQ	_b4aligned

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_bunaligned		/* TMP = FROM & 3 (1..3) is consumed by _bunaligned */

	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
_b32loop:
	CMP	R(TMP), R(TE)
	BLS	_b4tail			/* TE <= TS+31: fewer than 32 bytes left */

	MOVM.DB.W (R(FROM)), [R4-R7]	/* two 4-register moves per pass */
	MOVM.DB.W [R4-R7], (R(TE))
	MOVM.DB.W (R(FROM)), [R4-R7]
	MOVM.DB.W [R4-R7], (R(TE))
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	ADD	$3, R(TS), R(TMP)
_b4loop:
	CMP	R(TMP), R(TE)
	BLS	_b1tail			/* TE <= TS+3: fewer than 4 bytes left */

	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	R(TE), R(TS)
	BEQ	_return

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b1tail

/*
 * Ascending copy, taken when to <= from.  FROM and TS are
 * post-incremented by every access; the loops finish when TS has
 * reached TE.
 */
_forward:
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, R(TS), R(TMP)	/* TMP is N's register; N is not read again after this */
	BEQ	_f4aligned

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_funaligned		/* TMP = FROM & 3 (1..3) is consumed by _funaligned */

	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
_f32loop:
	CMP	R(TMP), R(TS)
	BHS	_f4tail			/* TS >= TE-31: fewer than 32 bytes left */

	MOVM.IA.W (R(FROM)), [R4-R7]	/* two 4-register moves per pass */
	MOVM.IA.W [R4-R7], (R(TS))
	MOVM.IA.W (R(FROM)), [R4-R7]
	MOVM.IA.W [R4-R7], (R(TS))
	B	_f32loop

_f4tail:
	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
	CMP	R(TMP), R(TS)
	BHS	_f1tail			/* TS >= TE-3: fewer than 4 bytes left */

	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
	B	_f4loop

_f1tail:				/* remaining bytes */
	CMP	R(TS), R(TE)
	BEQ	_return

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f1tail

/*
 * Common exit for every copy path.
 */
_return:
	MOVW	to+0(FP), R0		/* return the destination pointer stored in the frame at entry */
	RET

/*
 * Descending copy with a word-aligned destination and a misaligned
 * source.  The source is read as aligned words and each output word is
 * assembled from two neighbouring input words with a shift pair.
 *
 * Entered from _b4aligned with R(TMP) = FROM & 3 (1, 2 or 3).
 */
RSHIFT = 4				/* right-shift count */
LSHIFT = 5				/* left-shift count; RSHIFT + LSHIFT = 32 */
OFFSET = 11				/* bytes to add back to FROM after it was rounded down */

BR0 = 6					/* lower-addressed input word; carried into the next pass */
BW0 = 7					/* lower-addressed output word */
BR1 = 7					/* higher-addressed input word; shares R7 with BW0 */
BW1 = 8					/* higher-addressed output word */

_bunaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$8, R(TS), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TE)
	BLS	_b1tail			/* 8 bytes or fewer left; FROM is still untouched here */

	BIC	$3, R(FROM)		/* align source */
	MOVW	(R(FROM)), R(BR0)	/* prime first block register */

_bu8loop:
	CMP	R(TMP), R(TE)
	BLS	_bu1tail		/* TE <= TS+8: leave the rest to the byte loop */

	MOVW	R(BR0)<<R(LSHIFT), R(BW1)	/* start the upper output word from the carried word */
	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR1)]	/* fetch the next two lower source words */
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)	/* rewrites R7 in place; BR1 was last needed on the line above */
	ORR	R(BR0)>>R(RSHIFT), R(BW0)

	MOVM.DB.W [R(BW0)-R(BW1)], (R(TE))
	B	_bu8loop

_bu1tail:
	ADD	R(OFFSET), R(FROM)	/* undo the BIC so FROM is a byte pointer again */
	B	_b1tail

/*
 * Ascending copy with a word-aligned destination and a misaligned
 * source.  The source is read as aligned words and each output word is
 * assembled from two neighbouring input words with a shift pair.
 *
 * Entered from _f4aligned with R(TMP) = FROM & 3 (1, 2 or 3).
 */
RSHIFT = 4				/* right-shift count */
LSHIFT = 5				/* left-shift count; RSHIFT + LSHIFT = 32 */
OFFSET = 11				/* bytes to take back off FROM after the word reads ran ahead */

FW0 = 6					/* lower-addressed output word */
FR0 = 7					/* lower-addressed input word */
FW1 = 7					/* higher-addressed output word; shares R7 with FR0 */
FR1 = 8					/* higher-addressed input word; carried into the next pass */

_funaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$8, R(TE), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TS)
	BHS	_f1tail			/* 8 bytes or fewer left; FROM is still untouched here */

	BIC	$3, R(FROM)		/* align source */
	MOVW.P	4(R(FROM)), R(FR1)	/* prime last block register, implicit write back */

_fu8loop:
	CMP	R(TMP), R(TS)
	BHS	_fu1tail		/* TS >= TE-8: leave the rest to the byte loop */

	MOVW	R(FR1)>>R(RSHIFT), R(FW0)	/* start the lower output word from the carried word */
	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR1)]	/* fetch the next two higher source words */
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)	/* rewrites R7 in place; FR0 was last needed on the line above */
	ORR	R(FR1)<<R(LSHIFT), R(FW1)

	MOVM.IA.W [R(FW0)-R(FW1)], (R(TS))
	B	_fu8loop

_fu1tail:
	SUB	R(OFFSET), R(FROM)	/* step back to the first source byte not yet copied */
	B	_f1tail
