xref: /inferno-os/libkern/memmove-thumb.s (revision 37da2899f40661e3e9631e497da8dc59b971cbd0)
TS = 0
TE = 1
FROM = 2
N = 3
TMP = 3					/* N and TMP don't overlap in time, so they share R3 */
TMP1 = 4

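/*
 * overlap-safe block copy: when to > from the copy runs backward
 * from the top end, otherwise forward from the bottom end.  Each
 * direction byte-copies until the destination is word-aligned,
 * then moves 32-byte blocks with MOVM when the source is also
 * word-aligned (falling back to a shift-and-merge scheme when it
 * is not), and finishes with word and byte tails.
 */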
TEXT	memcpy(SB), $0
TEXT	memmove(SB), $-4
_memmove:
	MOVW	R(TS), to+0(FP)		/* need to save for return value */
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)	/* to end pointer */

	CMP	R(FROM), R(TS)
	BLS	_forward

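/*
 * to > from: copy down from the top end so that overlapping
 * bytes are read before they are overwritten
 */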
_back:
	ADD	R(N), R(FROM)		/* from end pointer */
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, R(TE), R(TMP)
	BEQ	_b4aligned

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_bunaligned

	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
_b32loop:
	CMP	R(TMP), R(TE)
	BLS	_b4tail

	MOVM.DB.W (R(FROM)), [R4-R11]	/* load 8 words, descending, implicit write back */
	MOVM.DB.W [R4-R11], (R(TE))	/* store 8 words, descending, implicit write back */
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	ADD	$3, R(TS), R(TMP)
_b4loop:
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	R(TE), R(TS)
	BEQ	_return

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b1tail

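/*
 * to <= from: copy up from the bottom end (the BLS above sends
 * equal pointers this way, which degenerates to a plain copy)
 */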
_forward:
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, R(TS), R(TMP)
	BEQ	_f4aligned

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_funaligned

	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
_f32loop:
	CMP	R(TMP), R(TS)
	BHS	_f4tail

	MOVM.IA.W (R(FROM)), [R4-R11]	/* load 8 words, ascending, implicit write back */
	MOVM.IA.W [R4-R11], (R(TS))	/* store 8 words, ascending, implicit write back */
	B	_f32loop

_f4tail:
	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
	B	_f4loop

_f1tail:
	CMP	R(TS), R(TE)
	BEQ	_return

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f1tail

_return:
	MOVW	to+0(FP), R0
	RET

RSHIFT = 4
LSHIFT = 5
OFFSET = 6

BR0 = 7
BW0 = 8
BR1 = 8
BW1 = 9
BR2 = 9
BW2 = 10
BR3 = 10
BW3 = 11

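/*
 * backward copy with a source that is not word-aligned: R(FROM) is
 * rounded down to a word boundary and each destination word is
 * merged from two adjacent source words, one shifted left by LSHIFT
 * and the next shifted right by RSHIFT (LSHIFT+RSHIFT = 32).  The
 * BRn/BWn banks overlap so each source word is shifted in place
 * once it has been consumed; BR0 carries over between iterations,
 * and OFFSET later repositions R(FROM) on the next uncopied byte
 * for _b1tail.
 */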
_bunaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$16, R(TS), R(TMP)	/* do 16-byte chunks if possible */
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	AND	$~0x03, R(FROM)		/* align source */
	MOVW	(R(FROM)), R(BR0)	/* prime first block register */

_bu16loop:
	CMP	R(TMP), R(TE)
	BLS	_bu1tail

	MOVW	R(BR0)<<R(LSHIFT), R(BW3)
	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
	ORR	R(BR3)>>R(RSHIFT), R(BW3)

	MOVW	R(BR3)<<R(LSHIFT), R(BW2)
	ORR	R(BR2)>>R(RSHIFT), R(BW2)

	MOVW	R(BR2)<<R(LSHIFT), R(BW1)
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
	ORR	R(BR0)>>R(RSHIFT), R(BW0)

	MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
	B	_bu16loop

_bu1tail:
	ADD	R(OFFSET), R(FROM)
	B	_b1tail

FW0 = 7
FR0 = 8
FW1 = 8
FR1 = 9
FW2 = 9
FR2 = 10
FW3 = 10
FR3 = 11

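/*
 * forward copy with a source that is not word-aligned: the mirror
 * image of _bunaligned.  R(FROM) is rounded down and one word is
 * pre-loaded into FR3; each destination word is then merged from
 * FR3>>RSHIFT and the following source word <<LSHIFT, with the
 * FWn/FRn banks overlapping so words are reused in place and FR3
 * carrying over between iterations.  OFFSET is subtracted from
 * R(FROM) to land on the next uncopied byte before _f1tail.
 */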
_funaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$16, R(TE), R(TMP)	/* do 16-byte chunks if possible */
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	AND	$~0x03, R(FROM)		/* align source */
	MOVW.P	4(R(FROM)), R(FR3)	/* prime last block register, implicit write back */

_fu16loop:
	CMP	R(TMP), R(TS)
	BHS	_fu1tail

	MOVW	R(FR3)>>R(RSHIFT), R(FW0)
	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR3)]
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
	ORR	R(FR1)<<R(LSHIFT), R(FW1)

	MOVW	R(FR1)>>R(RSHIFT), R(FW2)
	ORR	R(FR2)<<R(LSHIFT), R(FW2)

	MOVW	R(FR2)>>R(RSHIFT), R(FW3)
	ORR	R(FR3)<<R(LSHIFT), R(FW3)

	MOVM.IA.W [R(FW0)-R(FW3)], (R(TS))
	B	_fu16loop

_fu1tail:
	SUB	R(OFFSET), R(FROM)
	B	_f1tail

224