xref: /netbsd-src/common/lib/libc/arch/arm/string/memcpy_xscale.S (revision d48f14661dda8638fee055ba15d35bdfb29b9fa8)
1/*	$NetBSD: memcpy_xscale.S,v 1.1 2005/12/20 19:28:49 christos Exp $	*/
2
3/*
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39
40/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 * In:  r0 = dst, r1 = src, r2 = len.  Must return r0 (dst) unchanged.
 * Tuned for XScale: pld prefetch hints and paired strd stores below.
 */
41ENTRY(memcpy)
42	pld	[r1]
43	cmp	r2, #0x0c
44	ble	.Lmemcpy_short		/* <= 12 bytes */
45	mov	r3, r0			/* We must not clobber r0 */
46
47	/* Word-align the destination buffer */
48	ands	ip, r3, #0x03		/* Already word aligned? */
49	beq	.Lmemcpy_wordaligned	/* Yup */
	/*
	 * ip = dst & 3 (1..3); (4 - ip) bytes are needed to align:
	 * the first byte unconditionally, the second if ip <= 2 (le),
	 * the third if ip < 2 (lt).  The flags set by "cmp ip, #0x02"
	 * survive the whole ladder (ldrb/sub/strb do not set flags).
	 */
50	cmp	ip, #0x02
51	ldrb	ip, [r1], #0x01
52	sub	r2, r2, #0x01
53	strb	ip, [r3], #0x01
54	ldrleb	ip, [r1], #0x01
55	suble	r2, r2, #0x01
56	strleb	ip, [r3], #0x01
57	ldrltb	ip, [r1], #0x01
58	sublt	r2, r2, #0x01
59	strltb	ip, [r3], #0x01
60
61	/* Destination buffer is now word aligned */
62.Lmemcpy_wordaligned:
63	ands	ip, r1, #0x03		/* Is src also word-aligned? */
64	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
	/* ip now holds src & 3 and is consulted again at .Lmemcpy_bad_align */

66	/* Quad-align the destination buffer */
	/* 8-byte dst alignment lets the strd stores below hit aligned doublewords */
67	tst	r3, #0x07		/* Already quad aligned? */
68	ldrne	ip, [r1], #0x04
69	stmfd	sp!, {r4-r9}		/* Free up some registers */
70	subne	r2, r2, #0x04
71	strne	ip, [r3], #0x04
72
73	/* Destination buffer quad aligned, source is at least word aligned */
74	subs	r2, r2, #0x80
75	blt	.Lmemcpy_w_lessthan128
76
77	/* Copy 128 bytes at a time */
	/*
	 * Loads and stores are interleaved to hide load-use latency.
	 * Each strd writes an even/odd register pair (strd r4 stores
	 * r4:r5, strd r6 stores r6:r7, strd r8 stores r8:r9), and the
	 * pld hints keep the prefetcher one 0x20-byte chunk ahead of
	 * the loads.  On exit r2 has gone 0x80 negative; fixed up at
	 * .Lmemcpy_w_lessthan128.
	 */
78.Lmemcpy_w_loop128:
79	ldr	r4, [r1], #0x04		/* LD:00-03 */
80	ldr	r5, [r1], #0x04		/* LD:04-07 */
81	pld	[r1, #0x18]		/* Prefetch 0x20 */
82	ldr	r6, [r1], #0x04		/* LD:08-0b */
83	ldr	r7, [r1], #0x04		/* LD:0c-0f */
84	ldr	r8, [r1], #0x04		/* LD:10-13 */
85	ldr	r9, [r1], #0x04		/* LD:14-17 */
86	strd	r4, [r3], #0x08		/* ST:00-07 */
87	ldr	r4, [r1], #0x04		/* LD:18-1b */
88	ldr	r5, [r1], #0x04		/* LD:1c-1f */
89	strd	r6, [r3], #0x08		/* ST:08-0f */
90	ldr	r6, [r1], #0x04		/* LD:20-23 */
91	ldr	r7, [r1], #0x04		/* LD:24-27 */
92	pld	[r1, #0x18]		/* Prefetch 0x40 */
93	strd	r8, [r3], #0x08		/* ST:10-17 */
94	ldr	r8, [r1], #0x04		/* LD:28-2b */
95	ldr	r9, [r1], #0x04		/* LD:2c-2f */
96	strd	r4, [r3], #0x08		/* ST:18-1f */
97	ldr	r4, [r1], #0x04		/* LD:30-33 */
98	ldr	r5, [r1], #0x04		/* LD:34-37 */
99	strd	r6, [r3], #0x08		/* ST:20-27 */
100	ldr	r6, [r1], #0x04		/* LD:38-3b */
101	ldr	r7, [r1], #0x04		/* LD:3c-3f */
102	strd	r8, [r3], #0x08		/* ST:28-2f */
103	ldr	r8, [r1], #0x04		/* LD:40-43 */
104	ldr	r9, [r1], #0x04		/* LD:44-47 */
105	pld	[r1, #0x18]		/* Prefetch 0x60 */
106	strd	r4, [r3], #0x08		/* ST:30-37 */
107	ldr	r4, [r1], #0x04		/* LD:48-4b */
108	ldr	r5, [r1], #0x04		/* LD:4c-4f */
109	strd	r6, [r3], #0x08		/* ST:38-3f */
110	ldr	r6, [r1], #0x04		/* LD:50-53 */
111	ldr	r7, [r1], #0x04		/* LD:54-57 */
112	strd	r8, [r3], #0x08		/* ST:40-47 */
113	ldr	r8, [r1], #0x04		/* LD:58-5b */
114	ldr	r9, [r1], #0x04		/* LD:5c-5f */
115	strd	r4, [r3], #0x08		/* ST:48-4f */
116	ldr	r4, [r1], #0x04		/* LD:60-63 */
117	ldr	r5, [r1], #0x04		/* LD:64-67 */
118	pld	[r1, #0x18]		/* Prefetch 0x80 */
119	strd	r6, [r3], #0x08		/* ST:50-57 */
120	ldr	r6, [r1], #0x04		/* LD:68-6b */
121	ldr	r7, [r1], #0x04		/* LD:6c-6f */
122	strd	r8, [r3], #0x08		/* ST:58-5f */
123	ldr	r8, [r1], #0x04		/* LD:70-73 */
124	ldr	r9, [r1], #0x04		/* LD:74-77 */
125	strd	r4, [r3], #0x08		/* ST:60-67 */
126	ldr	r4, [r1], #0x04		/* LD:78-7b */
127	ldr	r5, [r1], #0x04		/* LD:7c-7f */
128	strd	r6, [r3], #0x08		/* ST:68-6f */
129	strd	r8, [r3], #0x08		/* ST:70-77 */
130	subs	r2, r2, #0x80
131	strd	r4, [r3], #0x08		/* ST:78-7f */
132	bge	.Lmemcpy_w_loop128
133
134.Lmemcpy_w_lessthan128:
135	adds	r2, r2, #0x80		/* Adjust for extra sub */
	/* If the count came out exactly 0, restore r4-r9 and return */
136	ldmeqfd	sp!, {r4-r9}
137	bxeq	lr			/* Return now if done */
138	subs	r2, r2, #0x20
139	blt	.Lmemcpy_w_lessthan32

141	/* Copy 32 bytes at a time */
	/* Same ldr/strd interleave pattern as the 128-byte loop above */
142.Lmemcpy_w_loop32:
143	ldr	r4, [r1], #0x04
144	ldr	r5, [r1], #0x04
145	pld	[r1, #0x18]
146	ldr	r6, [r1], #0x04
147	ldr	r7, [r1], #0x04
148	ldr	r8, [r1], #0x04
149	ldr	r9, [r1], #0x04
150	strd	r4, [r3], #0x08
151	ldr	r4, [r1], #0x04
152	ldr	r5, [r1], #0x04
153	strd	r6, [r3], #0x08
154	strd	r8, [r3], #0x08
155	subs	r2, r2, #0x20
156	strd	r4, [r3], #0x08
157	bge	.Lmemcpy_w_loop32
158
159.Lmemcpy_w_lessthan32:
160	adds	r2, r2, #0x20		/* Adjust for extra sub */
161	ldmeqfd	sp!, {r4-r9}
162	bxeq	lr			/* Return now if done */

	/*
	 * 1..31 bytes left.  r4 = 0x18 - (r2 & 0x18) is how many bytes
	 * of the 8-byte stanzas below to skip.  Each stanza is four
	 * instructions (16 bytes), hence the "lsl #1" scaling; the pc
	 * read in the addne is the addne's address + 8, i.e. the first
	 * stanza (just past the nop).  When the result is 0 (24 bytes
	 * or more remain) we fall straight through.
	 */
164	and	r4, r2, #0x18
165	rsbs	r4, r4, #0x18
166	addne	pc, pc, r4, lsl #1
167	nop

169	/* At least 24 bytes remaining */
170	ldr	r4, [r1], #0x04
171	ldr	r5, [r1], #0x04
172	sub	r2, r2, #0x08
173	strd	r4, [r3], #0x08

175	/* At least 16 bytes remaining */
176	ldr	r4, [r1], #0x04
177	ldr	r5, [r1], #0x04
178	sub	r2, r2, #0x08
179	strd	r4, [r3], #0x08

181	/* At least 8 bytes remaining */
182	ldr	r4, [r1], #0x04
183	ldr	r5, [r1], #0x04
184	subs	r2, r2, #0x08
185	strd	r4, [r3], #0x08

187	/* Less than 8 bytes remaining */
188	ldmfd	sp!, {r4-r9}
189	bxeq	lr			/* Return now if done */
	/*
	 * 0..7 bytes left: one optional word, then 1..3 trailing bytes
	 * chosen by the flags from "cmp r2, #0x02" (ge: 2nd byte
	 * exists, gt: 3rd byte exists).
	 */
190	subs	r2, r2, #0x04
191	ldrge	ip, [r1], #0x04
192	strge	ip, [r3], #0x04
193	bxeq	lr			/* Return now if done */
194	addlt	r2, r2, #0x04
195	ldrb	ip, [r1], #0x01
196	cmp	r2, #0x02
197	ldrgeb	r2, [r1], #0x01
198	strb	ip, [r3], #0x01
199	ldrgtb	ip, [r1]
200	strgeb	r2, [r3], #0x01
201	strgtb	ip, [r3]
202	bx	lr
203
204
205/*
206 * At this point, it has not been possible to word align both buffers.
207 * The destination buffer is word aligned, but the source buffer is not.
208 */
209.Lmemcpy_bad_align:
210	stmfd	sp!, {r4-r7}
	/* ip still holds (src & 3) from .Lmemcpy_wordaligned (1..3) */
211	bic	r1, r1, #0x03		/* Round src down to a word boundary */
212	cmp	ip, #2
213	ldr	ip, [r1], #0x04		/* Preload first aligned source word */
	/* Dispatch on the source misalignment: 3 (gt), 2 (eq), 1 */
214	bgt	.Lmemcpy_bad3
215	beq	.Lmemcpy_bad2
216	b	.Lmemcpy_bad1
217
	/*
	 * Source is 1 byte past a word boundary.  ip carries the
	 * previously-loaded aligned word; each destination word is
	 * assembled from two consecutive aligned source words with
	 * 8-/24-bit shifts (direction depends on endianness).
	 */
218.Lmemcpy_bad1_loop16:
219#ifdef __ARMEB__
220	mov	r4, ip, lsl #8
221#else
222	mov	r4, ip, lsr #8
223#endif
224	ldr	r5, [r1], #0x04
225	pld	[r1, #0x018]
226	ldr	r6, [r1], #0x04
227	ldr	r7, [r1], #0x04
228	ldr	ip, [r1], #0x04
229#ifdef __ARMEB__
230	orr	r4, r4, r5, lsr #24
231	mov	r5, r5, lsl #8
232	orr	r5, r5, r6, lsr #24
233	mov	r6, r6, lsl #8
234	orr	r6, r6, r7, lsr #24
235	mov	r7, r7, lsl #8
236	orr	r7, r7, ip, lsr #24
237#else
238	orr	r4, r4, r5, lsl #24
239	mov	r5, r5, lsr #8
240	orr	r5, r5, r6, lsl #24
241	mov	r6, r6, lsr #8
242	orr	r6, r6, r7, lsl #24
243	mov	r7, r7, lsr #8
244	orr	r7, r7, ip, lsl #24
245#endif
246	str	r4, [r3], #0x04
247	str	r5, [r3], #0x04
248	str	r6, [r3], #0x04
249	str	r7, [r3], #0x04
250.Lmemcpy_bad1:
251	subs	r2, r2, #0x10
252	bge	.Lmemcpy_bad1_loop16

254	adds	r2, r2, #0x10
255	ldmeqfd	sp!, {r4-r7}
256	bxeq	lr			/* Return now if done */
257	subs	r2, r2, #0x04
258	sublt	r1, r1, #0x03		/* Back up to the first unconsumed byte */
259	blt	.Lmemcpy_bad_done

	/* Word-at-a-time version of the merge above */
261.Lmemcpy_bad1_loop4:
262#ifdef __ARMEB__
263	mov	r4, ip, lsl #8
264#else
265	mov	r4, ip, lsr #8
266#endif
267	ldr	ip, [r1], #0x04
268	subs	r2, r2, #0x04
269#ifdef __ARMEB__
270	orr	r4, r4, ip, lsr #24
271#else
272	orr	r4, r4, ip, lsl #24
273#endif
274	str	r4, [r3], #0x04
275	bge	.Lmemcpy_bad1_loop4
276	sub	r1, r1, #0x03		/* Back up to the first unconsumed byte */
277	b	.Lmemcpy_bad_done
278
	/*
	 * Source is 2 bytes past a word boundary: same scheme as
	 * .Lmemcpy_bad1 but merging halves with 16-bit shifts.
	 */
279.Lmemcpy_bad2_loop16:
280#ifdef __ARMEB__
281	mov	r4, ip, lsl #16
282#else
283	mov	r4, ip, lsr #16
284#endif
285	ldr	r5, [r1], #0x04
286	pld	[r1, #0x018]
287	ldr	r6, [r1], #0x04
288	ldr	r7, [r1], #0x04
289	ldr	ip, [r1], #0x04
290#ifdef __ARMEB__
291	orr	r4, r4, r5, lsr #16
292	mov	r5, r5, lsl #16
293	orr	r5, r5, r6, lsr #16
294	mov	r6, r6, lsl #16
295	orr	r6, r6, r7, lsr #16
296	mov	r7, r7, lsl #16
297	orr	r7, r7, ip, lsr #16
298#else
299	orr	r4, r4, r5, lsl #16
300	mov	r5, r5, lsr #16
301	orr	r5, r5, r6, lsl #16
302	mov	r6, r6, lsr #16
303	orr	r6, r6, r7, lsl #16
304	mov	r7, r7, lsr #16
305	orr	r7, r7, ip, lsl #16
306#endif
307	str	r4, [r3], #0x04
308	str	r5, [r3], #0x04
309	str	r6, [r3], #0x04
310	str	r7, [r3], #0x04
311.Lmemcpy_bad2:
312	subs	r2, r2, #0x10
313	bge	.Lmemcpy_bad2_loop16

315	adds	r2, r2, #0x10
316	ldmeqfd	sp!, {r4-r7}
317	bxeq	lr			/* Return now if done */
318	subs	r2, r2, #0x04
319	sublt	r1, r1, #0x02		/* Back up to the first unconsumed byte */
320	blt	.Lmemcpy_bad_done

	/* Word-at-a-time version of the merge above */
322.Lmemcpy_bad2_loop4:
323#ifdef __ARMEB__
324	mov	r4, ip, lsl #16
325#else
326	mov	r4, ip, lsr #16
327#endif
328	ldr	ip, [r1], #0x04
329	subs	r2, r2, #0x04
330#ifdef __ARMEB__
331	orr	r4, r4, ip, lsr #16
332#else
333	orr	r4, r4, ip, lsl #16
334#endif
335	str	r4, [r3], #0x04
336	bge	.Lmemcpy_bad2_loop4
337	sub	r1, r1, #0x02		/* Back up to the first unconsumed byte */
338	b	.Lmemcpy_bad_done
339
	/*
	 * Source is 3 bytes past a word boundary: same scheme as
	 * .Lmemcpy_bad1 but taking 1 byte from the old word and 3 from
	 * the new one (24-/8-bit shifts).
	 */
340.Lmemcpy_bad3_loop16:
341#ifdef __ARMEB__
342	mov	r4, ip, lsl #24
343#else
344	mov	r4, ip, lsr #24
345#endif
346	ldr	r5, [r1], #0x04
347	pld	[r1, #0x018]
348	ldr	r6, [r1], #0x04
349	ldr	r7, [r1], #0x04
350	ldr	ip, [r1], #0x04
351#ifdef __ARMEB__
352	orr	r4, r4, r5, lsr #8
353	mov	r5, r5, lsl #24
354	orr	r5, r5, r6, lsr #8
355	mov	r6, r6, lsl #24
356	orr	r6, r6, r7, lsr #8
357	mov	r7, r7, lsl #24
358	orr	r7, r7, ip, lsr #8
359#else
360	orr	r4, r4, r5, lsl #8
361	mov	r5, r5, lsr #24
362	orr	r5, r5, r6, lsl #8
363	mov	r6, r6, lsr #24
364	orr	r6, r6, r7, lsl #8
365	mov	r7, r7, lsr #24
366	orr	r7, r7, ip, lsl #8
367#endif
368	str	r4, [r3], #0x04
369	str	r5, [r3], #0x04
370	str	r6, [r3], #0x04
371	str	r7, [r3], #0x04
372.Lmemcpy_bad3:
373	subs	r2, r2, #0x10
374	bge	.Lmemcpy_bad3_loop16

376	adds	r2, r2, #0x10
377	ldmeqfd	sp!, {r4-r7}
378	bxeq	lr			/* Return now if done */
379	subs	r2, r2, #0x04
380	sublt	r1, r1, #0x01		/* Back up to the first unconsumed byte */
381	blt	.Lmemcpy_bad_done

	/* Word-at-a-time version of the merge above */
383.Lmemcpy_bad3_loop4:
384#ifdef __ARMEB__
385	mov	r4, ip, lsl #24
386#else
387	mov	r4, ip, lsr #24
388#endif
389	ldr	ip, [r1], #0x04
390	subs	r2, r2, #0x04
391#ifdef __ARMEB__
392	orr	r4, r4, ip, lsr #8
393#else
394	orr	r4, r4, ip, lsl #8
395#endif
396	str	r4, [r3], #0x04
397	bge	.Lmemcpy_bad3_loop4
398	sub	r1, r1, #0x01		/* Back up to the first unconsumed byte */
399
	/*
	 * Misaligned-copy tail: restore the scratch registers, then
	 * copy the last 0..3 bytes with the usual conditional ladder
	 * (flags from "cmp r2, #0x02": ge = 2nd byte, gt = 3rd byte).
	 */
400.Lmemcpy_bad_done:
401	ldmfd	sp!, {r4-r7}
402	adds	r2, r2, #0x04		/* Undo the loop's over-subtract */
403	bxeq	lr
404	ldrb	ip, [r1], #0x01
405	cmp	r2, #0x02
406	ldrgeb	r2, [r1], #0x01
407	strb	ip, [r3], #0x01
408	ldrgtb	ip, [r1]
409	strgeb	r2, [r3], #0x01
410	strgtb	ip, [r3]
411	bx	lr
412
413
414/*
415 * Handle short copies (less than 16 bytes), possibly misaligned.
416 * Some of these are *very* common, thanks to the network stack,
417 * and so are handled specially.
418 */
419.Lmemcpy_short:
420#ifndef _STANDALONE
	/*
	 * Branch table indexed by length (0..12).  The pc read below is
	 * this add's address + 8, i.e. the "0x00" slot just past the
	 * nop; each slot is one 4-byte branch, hence "lsl #2".
	 */
421	add	pc, pc, r2, lsl #2
422	nop
423	bx	lr			/* 0x00 */
424	b	.Lmemcpy_bytewise	/* 0x01 */
425	b	.Lmemcpy_bytewise	/* 0x02 */
426	b	.Lmemcpy_bytewise	/* 0x03 */
427	b	.Lmemcpy_4		/* 0x04 */
428	b	.Lmemcpy_bytewise	/* 0x05 */
429	b	.Lmemcpy_6		/* 0x06 */
430	b	.Lmemcpy_bytewise	/* 0x07 */
431	b	.Lmemcpy_8		/* 0x08 */
432	b	.Lmemcpy_bytewise	/* 0x09 */
433	b	.Lmemcpy_bytewise	/* 0x0a */
434	b	.Lmemcpy_bytewise	/* 0x0b */
435	b	.Lmemcpy_c		/* 0x0c */
436#endif
	/*
	 * Simple byte-at-a-time copy.  NOTE(review): assumes r2 >= 1;
	 * the len == 0 case is caught by the table above when that is
	 * compiled in — confirm _STANDALONE callers never pass 0.
	 */
437.Lmemcpy_bytewise:
438	mov	r3, r0			/* We must not clobber r0 */
439	ldrb	ip, [r1], #0x01
4401:	subs	r2, r2, #0x01
441	strb	ip, [r3], #0x01
442	ldrneb	ip, [r1], #0x01
443	bne	1b
444	bx	lr
445
446#ifndef _STANDALONE
447/******************************************************************************
448 * Special case for 4 byte copies
449 */
450#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
451#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
452	LMEMCPY_4_PAD
453.Lmemcpy_4:
	/*
	 * r2 = ((dst & 3) << 2) | (src & 3): 16 alignment combinations,
	 * each handled in its own 64-byte slot below (the PAD aligns
	 * every slot).  r3 = &.Lmemcpy_4: the pc read in the sub is the
	 * sub's address + 8, and 0x14 backs up over that plus the three
	 * preceding instructions.  Case 0000 simply falls through.
	 */
454	and	r2, r1, #0x03
455	orr	r2, r2, r0, lsl #2
456	ands	r2, r2, #0x0f
457	sub	r3, pc, #0x14
458	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

460/*
461 * 0000: dst is 32-bit aligned, src is 32-bit aligned
462 */
463	ldr	r2, [r1]
464	str	r2, [r0]
465	bx	lr
466	LMEMCPY_4_PAD

468/*
469 * 0001: dst is 32-bit aligned, src is 8-bit aligned
470 */
471	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
472	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
473#ifdef __ARMEB__
474	mov	r3, r3, lsl #8		/* r3 = 012. */
475	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
476#else
477	mov	r3, r3, lsr #8		/* r3 = .210 */
478	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
479#endif
480	str	r3, [r0]
481	bx	lr
482	LMEMCPY_4_PAD

484/*
485 * 0010: dst is 32-bit aligned, src is 16-bit aligned
486 */
487#ifdef __ARMEB__
488	ldrh	r3, [r1]
489	ldrh	r2, [r1, #0x02]
490#else
491	ldrh	r3, [r1, #0x02]
492	ldrh	r2, [r1]
493#endif
494	orr	r3, r2, r3, lsl #16
495	str	r3, [r0]
496	bx	lr
497	LMEMCPY_4_PAD

499/*
500 * 0011: dst is 32-bit aligned, src is 8-bit aligned
501 */
502	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
503	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
504#ifdef __ARMEB__
505	mov	r3, r3, lsl #24		/* r3 = 0... */
506	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
507#else
508	mov	r3, r3, lsr #24		/* r3 = ...0 */
509	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
510#endif
511	str	r3, [r0]
512	bx	lr
513	LMEMCPY_4_PAD

515/*
516 * 0100: dst is 8-bit aligned, src is 32-bit aligned
517 */
518	ldr	r2, [r1]
519#ifdef __ARMEB__
520	strb	r2, [r0, #0x03]
521	mov	r3, r2, lsr #8
522	mov	r1, r2, lsr #24
523	strb	r1, [r0]
524#else
525	strb	r2, [r0]
526	mov	r3, r2, lsr #8
527	mov	r1, r2, lsr #24
528	strb	r1, [r0, #0x03]
529#endif
530	strh	r3, [r0, #0x01]
531	bx	lr
532	LMEMCPY_4_PAD

534/*
535 * 0101: dst is 8-bit aligned, src is 8-bit aligned
536 */
537	ldrb	r2, [r1]
538	ldrh	r3, [r1, #0x01]
539	ldrb	r1, [r1, #0x03]
540	strb	r2, [r0]
541	strh	r3, [r0, #0x01]
542	strb	r1, [r0, #0x03]
543	bx	lr
544	LMEMCPY_4_PAD

546/*
547 * 0110: dst is 8-bit aligned, src is 16-bit aligned
548 */
549	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
550	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
551#ifdef __ARMEB__
552	mov	r1, r2, lsr #8		/* r1 = ...0 */
553	strb	r1, [r0]
554	mov	r2, r2, lsl #8		/* r2 = .01. */
555	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
556#else
557	strb	r2, [r0]
558	mov	r2, r2, lsr #8		/* r2 = ...1 */
559	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
560	mov	r3, r3, lsr #8		/* r3 = ...3 */
561#endif
562	strh	r2, [r0, #0x01]
563	strb	r3, [r0, #0x03]
564	bx	lr
565	LMEMCPY_4_PAD

567/*
568 * 0111: dst is 8-bit aligned, src is 8-bit aligned
569 */
570	ldrb	r2, [r1]
571	ldrh	r3, [r1, #0x01]
572	ldrb	r1, [r1, #0x03]
573	strb	r2, [r0]
574	strh	r3, [r0, #0x01]
575	strb	r1, [r0, #0x03]
576	bx	lr
577	LMEMCPY_4_PAD

579/*
580 * 1000: dst is 16-bit aligned, src is 32-bit aligned
581 */
582	ldr	r2, [r1]
583#ifdef __ARMEB__
584	strh	r2, [r0, #0x02]
585	mov	r3, r2, lsr #16
586	strh	r3, [r0]
587#else
588	strh	r2, [r0]
589	mov	r3, r2, lsr #16
590	strh	r3, [r0, #0x02]
591#endif
592	bx	lr
593	LMEMCPY_4_PAD

595/*
596 * 1001: dst is 16-bit aligned, src is 8-bit aligned
597 */
598	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
599	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
600	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
601	strh	r1, [r0]
602#ifdef __ARMEB__
603	mov	r2, r2, lsl #8		/* r2 = 012. */
604	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
605#else
606	mov	r2, r2, lsr #24		/* r2 = ...2 */
607	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
608#endif
609	strh	r2, [r0, #0x02]
610	bx	lr
611	LMEMCPY_4_PAD

613/*
614 * 1010: dst is 16-bit aligned, src is 16-bit aligned
615 */
616	ldrh	r2, [r1]
617	ldrh	r3, [r1, #0x02]
618	strh	r2, [r0]
619	strh	r3, [r0, #0x02]
620	bx	lr
621	LMEMCPY_4_PAD

623/*
624 * 1011: dst is 16-bit aligned, src is 8-bit aligned
625 */
626	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
627	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
628	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
629	strh	r1, [r0, #0x02]
630#ifdef __ARMEB__
631	mov	r3, r3, lsr #24		/* r3 = ...1 */
632	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
633#else
634	mov	r3, r3, lsl #8		/* r3 = 321. */
635	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
636#endif
637	strh	r3, [r0]
638	bx	lr
639	LMEMCPY_4_PAD

641/*
642 * 1100: dst is 8-bit aligned, src is 32-bit aligned
643 */
644	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
645#ifdef __ARMEB__
646	strb	r2, [r0, #0x03]
647	mov	r3, r2, lsr #8
648	mov	r1, r2, lsr #24
649	strh	r3, [r0, #0x01]
650	strb	r1, [r0]
651#else
652	strb	r2, [r0]
653	mov	r3, r2, lsr #8
654	mov	r1, r2, lsr #24
655	strh	r3, [r0, #0x01]
656	strb	r1, [r0, #0x03]
657#endif
658	bx	lr
659	LMEMCPY_4_PAD

661/*
662 * 1101: dst is 8-bit aligned, src is 8-bit aligned
663 */
664	ldrb	r2, [r1]
665	ldrh	r3, [r1, #0x01]
666	ldrb	r1, [r1, #0x03]
667	strb	r2, [r0]
668	strh	r3, [r0, #0x01]
669	strb	r1, [r0, #0x03]
670	bx	lr
671	LMEMCPY_4_PAD

673/*
674 * 1110: dst is 8-bit aligned, src is 16-bit aligned
675 */
676#ifdef __ARMEB__
677	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
678	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
679	strb	r3, [r0, #0x03]
680	mov	r3, r3, lsr #8		/* r3 = ...2 */
681	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
682	strh	r3, [r0, #0x01]
683	mov	r2, r2, lsr #8		/* r2 = ...0 */
684	strb	r2, [r0]
685#else
686	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
687	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
688	strb	r2, [r0]
689	mov	r2, r2, lsr #8		/* r2 = ...1 */
690	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
691	strh	r2, [r0, #0x01]
692	mov	r3, r3, lsr #8		/* r3 = ...3 */
693	strb	r3, [r0, #0x03]
694#endif
695	bx	lr
696	LMEMCPY_4_PAD

698/*
699 * 1111: dst is 8-bit aligned, src is 8-bit aligned
700 */
701	ldrb	r2, [r1]
702	ldrh	r3, [r1, #0x01]
703	ldrb	r1, [r1, #0x03]
704	strb	r2, [r0]
705	strh	r3, [r0, #0x01]
706	strb	r1, [r0, #0x03]
707	bx	lr
708	LMEMCPY_4_PAD
709
710
711/******************************************************************************
712 * Special case for 6 byte copies
713 */
714#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
715#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
716	LMEMCPY_6_PAD
717.Lmemcpy_6:
	/*
	 * Same dispatch scheme as .Lmemcpy_4: 4-bit alignment index
	 * ((dst & 3) << 2 | (src & 3)) selects one of 16 64-byte slots;
	 * r3 = &.Lmemcpy_6; case 0000 falls through.
	 */
718	and	r2, r1, #0x03
719	orr	r2, r2, r0, lsl #2
720	ands	r2, r2, #0x0f
721	sub	r3, pc, #0x14
722	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

724/*
725 * 0000: dst is 32-bit aligned, src is 32-bit aligned
726 */
727	ldr	r2, [r1]
728	ldrh	r3, [r1, #0x04]
729	str	r2, [r0]
730	strh	r3, [r0, #0x04]
731	bx	lr
732	LMEMCPY_6_PAD

734/*
735 * 0001: dst is 32-bit aligned, src is 8-bit aligned
736 */
737	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
738	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
739#ifdef __ARMEB__
740	mov	r2, r2, lsl #8		/* r2 = 012. */
741	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
742#else
743	mov	r2, r2, lsr #8		/* r2 = .210 */
744	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
745#endif
746	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
747	str	r2, [r0]
748	strh	r3, [r0, #0x04]
749	bx	lr
750	LMEMCPY_6_PAD

752/*
753 * 0010: dst is 32-bit aligned, src is 16-bit aligned
754 */
755	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
756	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
757#ifdef __ARMEB__
758	mov	r1, r3, lsr #16		/* r1 = ..23 */
759	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
760	str	r1, [r0]
761	strh	r3, [r0, #0x04]
762#else
763	mov	r1, r3, lsr #16		/* r1 = ..54 */
764	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
765	str	r2, [r0]
766	strh	r1, [r0, #0x04]
767#endif
768	bx	lr
769	LMEMCPY_6_PAD

771/*
772 * 0011: dst is 32-bit aligned, src is 8-bit aligned
773 */
774	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
775	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
776	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
777#ifdef __ARMEB__
778	mov	r2, r2, lsl #24		/* r2 = 0... */
779	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
780	mov	r3, r3, lsl #8		/* r3 = 234. */
781	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
782#else
783	mov	r2, r2, lsr #24		/* r2 = ...0 */
784	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
785	mov	r1, r1, lsl #8		/* r1 = xx5. */
786	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
787#endif
788	str	r2, [r0]
789	strh	r1, [r0, #0x04]
790	bx	lr
791	LMEMCPY_6_PAD

793/*
794 * 0100: dst is 8-bit aligned, src is 32-bit aligned
795 */
796	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
797	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
798	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
799	strh	r1, [r0, #0x01]
800#ifdef __ARMEB__
801	mov	r1, r3, lsr #24		/* r1 = ...0 */
802	strb	r1, [r0]
803	mov	r3, r3, lsl #8		/* r3 = 123. */
804	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
805#else
806	strb	r3, [r0]
807	mov	r3, r3, lsr #24		/* r3 = ...3 */
808	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
809	mov	r2, r2, lsr #8		/* r2 = ...5 */
810#endif
811	strh	r3, [r0, #0x03]
812	strb	r2, [r0, #0x05]
813	bx	lr
814	LMEMCPY_6_PAD

816/*
817 * 0101: dst is 8-bit aligned, src is 8-bit aligned
818 */
819	ldrb	r2, [r1]
820	ldrh	r3, [r1, #0x01]
821	ldrh	ip, [r1, #0x03]
822	ldrb	r1, [r1, #0x05]
823	strb	r2, [r0]
824	strh	r3, [r0, #0x01]
825	strh	ip, [r0, #0x03]
826	strb	r1, [r0, #0x05]
827	bx	lr
828	LMEMCPY_6_PAD

830/*
831 * 0110: dst is 8-bit aligned, src is 16-bit aligned
832 */
833	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
834	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
835#ifdef __ARMEB__
836	mov	r3, r2, lsr #8		/* r3 = ...0 */
837	strb	r3, [r0]
838	strb	r1, [r0, #0x05]
839	mov	r3, r1, lsr #8		/* r3 = .234 */
840	strh	r3, [r0, #0x03]
841	mov	r3, r2, lsl #8		/* r3 = .01. */
842	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
843	strh	r3, [r0, #0x01]
844#else
845	strb	r2, [r0]
846	mov	r3, r1, lsr #24
847	strb	r3, [r0, #0x05]
848	mov	r3, r1, lsr #8		/* r3 = .543 */
849	strh	r3, [r0, #0x03]
850	mov	r3, r2, lsr #8		/* r3 = ...1 */
851	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
852	strh	r3, [r0, #0x01]
853#endif
854	bx	lr
855	LMEMCPY_6_PAD

857/*
858 * 0111: dst is 8-bit aligned, src is 8-bit aligned
859 */
860	ldrb	r2, [r1]
861	ldrh	r3, [r1, #0x01]
862	ldrh	ip, [r1, #0x03]
863	ldrb	r1, [r1, #0x05]
864	strb	r2, [r0]
865	strh	r3, [r0, #0x01]
866	strh	ip, [r0, #0x03]
867	strb	r1, [r0, #0x05]
868	bx	lr
869	LMEMCPY_6_PAD

871/*
872 * 1000: dst is 16-bit aligned, src is 32-bit aligned
873 */
874#ifdef __ARMEB__
875	ldr	r2, [r1]		/* r2 = 0123 */
876	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
877	mov	r1, r2, lsr #16		/* r1 = ..01 */
878	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
879	strh	r1, [r0]
880	str	r3, [r0, #0x02]
881#else
882	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
883	ldr	r3, [r1]		/* r3 = 3210 */
884	mov	r2, r2, lsl #16		/* r2 = 54.. */
885	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
886	strh	r3, [r0]
887	str	r2, [r0, #0x02]
888#endif
889	bx	lr
890	LMEMCPY_6_PAD

892/*
893 * 1001: dst is 16-bit aligned, src is 8-bit aligned
894 */
895	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
896	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
897	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
898#ifdef __ARMEB__
899	mov	r2, r2, lsr #8		/* r2 = .345 */
900	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
901#else
902	mov	r2, r2, lsl #8		/* r2 = 543. */
903	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
904#endif
905	strh	r1, [r0]
906	str	r2, [r0, #0x02]
907	bx	lr
908	LMEMCPY_6_PAD

910/*
911 * 1010: dst is 16-bit aligned, src is 16-bit aligned
912 */
913	ldrh	r2, [r1]
914	ldr	r3, [r1, #0x02]
915	strh	r2, [r0]
916	str	r3, [r0, #0x02]
917	bx	lr
918	LMEMCPY_6_PAD

920/*
921 * 1011: dst is 16-bit aligned, src is 8-bit aligned
922 */
923	ldrb	r3, [r1]		/* r3 = ...0 */
924	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
925	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
926#ifdef __ARMEB__
927	mov	r3, r3, lsl #8		/* r3 = ..0. */
928	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
929	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
930#else
931	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
932	mov	r1, r1, lsl #24		/* r1 = 5... */
933	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
934#endif
935	strh	r3, [r0]
936	str	r1, [r0, #0x02]
937	bx	lr
938	LMEMCPY_6_PAD

940/*
941 * 1100: dst is 8-bit aligned, src is 32-bit aligned
942 */
943	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
944	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
945#ifdef __ARMEB__
946	mov	r3, r2, lsr #24		/* r3 = ...0 */
947	strb	r3, [r0]
948	mov	r2, r2, lsl #8		/* r2 = 123. */
949	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
950#else
951	strb	r2, [r0]
952	mov	r2, r2, lsr #8		/* r2 = .321 */
953	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
954	mov	r1, r1, lsr #8		/* r1 = ...5 */
955#endif
956	str	r2, [r0, #0x01]
957	strb	r1, [r0, #0x05]
958	bx	lr
959	LMEMCPY_6_PAD

961/*
962 * 1101: dst is 8-bit aligned, src is 8-bit aligned
963 */
964	ldrb	r2, [r1]
965	ldrh	r3, [r1, #0x01]
966	ldrh	ip, [r1, #0x03]
967	ldrb	r1, [r1, #0x05]
968	strb	r2, [r0]
969	strh	r3, [r0, #0x01]
970	strh	ip, [r0, #0x03]
971	strb	r1, [r0, #0x05]
972	bx	lr
973	LMEMCPY_6_PAD

975/*
976 * 1110: dst is 8-bit aligned, src is 16-bit aligned
977 */
978	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
979	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
980#ifdef __ARMEB__
981	mov	r3, r2, lsr #8		/* r3 = ...0 */
982	strb	r3, [r0]
983	mov	r2, r2, lsl #24		/* r2 = 1... */
984	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
985#else
986	strb	r2, [r0]
987	mov	r2, r2, lsr #8		/* r2 = ...1 */
988	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
989	mov	r1, r1, lsr #24		/* r1 = ...5 */
990#endif
991	str	r2, [r0, #0x01]
992	strb	r1, [r0, #0x05]
993	bx	lr
994	LMEMCPY_6_PAD

996/*
997 * 1111: dst is 8-bit aligned, src is 8-bit aligned
998 */
999	ldrb	r2, [r1]
1000	ldr	r3, [r1, #0x01]
1001	ldrb	r1, [r1, #0x05]
1002	strb	r2, [r0]
1003	str	r3, [r0, #0x01]
1004	strb	r1, [r0, #0x05]
1005	bx	lr
1006	LMEMCPY_6_PAD
1007
1008
1009/******************************************************************************
1010 * Special case for 8 byte copies
1011 */
1012#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
1013#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
1014	LMEMCPY_8_PAD
1015.Lmemcpy_8:
1016	and	r2, r1, #0x03
1017	orr	r2, r2, r0, lsl #2
1018	ands	r2, r2, #0x0f
1019	sub	r3, pc, #0x14
1020	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
1021
1022/*
1023 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1024 */
1025	ldr	r2, [r1]
1026	ldr	r3, [r1, #0x04]
1027	str	r2, [r0]
1028	str	r3, [r0, #0x04]
1029	bx	lr
1030	LMEMCPY_8_PAD
1031
1032/*
1033 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1034 */
1035	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
1036	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
1037	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1038#ifdef __ARMEB__
1039	mov	r3, r3, lsl #8		/* r3 = 012. */
1040	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
1041	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
1042#else
1043	mov	r3, r3, lsr #8		/* r3 = .210 */
1044	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
1045	mov	r1, r1, lsl #24		/* r1 = 7... */
1046	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
1047#endif
1048	str	r3, [r0]
1049	str	r2, [r0, #0x04]
1050	bx	lr
1051	LMEMCPY_8_PAD
1052
1053/*
1054 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1055 */
1056	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1057	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1058	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1059#ifdef __ARMEB__
1060	mov	r2, r2, lsl #16		/* r2 = 01.. */
1061	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
1062	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
1063#else
1064	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1065	mov	r3, r3, lsr #16		/* r3 = ..54 */
1066	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
1067#endif
1068	str	r2, [r0]
1069	str	r3, [r0, #0x04]
1070	bx	lr
1071	LMEMCPY_8_PAD
1072
1073/*
1074 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1075 */
1076	ldrb	r3, [r1]		/* r3 = ...0 */
1077	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
1078	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
1079#ifdef __ARMEB__
1080	mov	r3, r3, lsl #24		/* r3 = 0... */
1081	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
1082	mov	r2, r2, lsl #24		/* r2 = 4... */
1083	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
1084#else
1085	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
1086	mov	r2, r2, lsr #24		/* r2 = ...4 */
1087	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
1088#endif
1089	str	r3, [r0]
1090	str	r2, [r0, #0x04]
1091	bx	lr
1092	LMEMCPY_8_PAD
1093
1094/*
1095 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1096 */
1097	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
1098	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
1099#ifdef __ARMEB__
1100	mov	r1, r3, lsr #24		/* r1 = ...0 */
1101	strb	r1, [r0]
1102	mov	r1, r3, lsr #8		/* r1 = .012 */
1103	strb	r2, [r0, #0x07]
1104	mov	r3, r3, lsl #24		/* r3 = 3... */
1105	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
1106#else
1107	strb	r3, [r0]
1108	mov	r1, r2, lsr #24		/* r1 = ...7 */
1109	strb	r1, [r0, #0x07]
1110	mov	r1, r3, lsr #8		/* r1 = .321 */
1111	mov	r3, r3, lsr #24		/* r3 = ...3 */
1112	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
1113#endif
1114	strh	r1, [r0, #0x01]
1115	str	r3, [r0, #0x03]
1116	bx	lr
1117	LMEMCPY_8_PAD
1118
1119/*
1120 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1121 */
1122	ldrb	r2, [r1]
1123	ldrh	r3, [r1, #0x01]
1124	ldr	ip, [r1, #0x03]
1125	ldrb	r1, [r1, #0x07]
1126	strb	r2, [r0]
1127	strh	r3, [r0, #0x01]
1128	str	ip, [r0, #0x03]
1129	strb	r1, [r0, #0x07]
1130	bx	lr
1131	LMEMCPY_8_PAD
1132
1133/*
1134 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1135 */
1136	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1137	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1138	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1139#ifdef __ARMEB__
1140	mov	ip, r2, lsr #8		/* ip = ...0 */
1141	strb	ip, [r0]
1142	mov	ip, r2, lsl #8		/* ip = .01. */
1143	orr	ip, ip, r3, lsr #24	/* ip = .012 */
1144	strb	r1, [r0, #0x07]
1145	mov	r3, r3, lsl #8		/* r3 = 345. */
1146	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
1147#else
1148	strb	r2, [r0]		/* 0 */
1149	mov	ip, r1, lsr #8		/* ip = ...7 */
1150	strb	ip, [r0, #0x07]		/* 7 */
1151	mov	ip, r2, lsr #8		/* ip = ...1 */
1152	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
1153	mov	r3, r3, lsr #8		/* r3 = .543 */
1154	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
1155#endif
1156	strh	ip, [r0, #0x01]
1157	str	r3, [r0, #0x03]
1158	bx	lr
1159	LMEMCPY_8_PAD
1160
1161/*
1162 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1163 */
1164	ldrb	r3, [r1]		/* r3 = ...0 */
1165	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
1166	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
1167	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1168	strb	r3, [r0]
1169	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
1170#ifdef __ARMEB__
1171	strh	r3, [r0, #0x01]
1172	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
1173#else
1174	strh	ip, [r0, #0x01]
1175	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
1176#endif
1177	str	r2, [r0, #0x03]
1178	strb	r1, [r0, #0x07]
1179	bx	lr
1180	LMEMCPY_8_PAD
1181
1182/*
1183 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1184 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.  Two
 * aligned word loads, output as halfword / word / halfword. */
1185	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1186	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1187	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
1188#ifdef __ARMEB__
1189	strh	r1, [r0]
1190	mov	r1, r3, lsr #16		/* r1 = ..45 */
1191	orr	r2, r1 ,r2, lsl #16	/* r2 = 2345 */
1192#else
1193	strh	r2, [r0]
1194	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
1195	mov	r3, r3, lsr #16		/* r3 = ..76 */
1196#endif
1197	str	r2, [r0, #0x02]
1198	strh	r3, [r0, #0x06]
1199	bx	lr
1200	LMEMCPY_8_PAD
1201
1202/*
1203 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1204 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.  The
 * ldr at [r1, #-1] is word-aligned because (src & 3) == 1; the stray byte
 * ('x') is shifted away before use. */
1205	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1206	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1207	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
1208	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
1209	strh	r1, [r0]
1210#ifdef __ARMEB__
1211	mov	r1, r2, lsl #24		/* r1 = 2... */
1212	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
1213	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
1214#else
1215	mov	r1, r2, lsr #24		/* r1 = ...2 */
1216	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
1217	mov	r3, r3, lsr #24		/* r3 = ...6 */
1218	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
1219#endif
1220	str	r1, [r0, #0x02]
1221	strh	r3, [r0, #0x06]
1222	bx	lr
1223	LMEMCPY_8_PAD
1224
1225/*
1226 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1227 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.  Both
 * halfword-aligned: direct halfword/word/halfword copy, no byte shuffling. */
1228	ldrh	r2, [r1]
1229	ldr	ip, [r1, #0x02]
1230	ldrh	r3, [r1, #0x06]
1231	strh	r2, [r0]
1232	str	ip, [r0, #0x02]
1233	strh	r3, [r0, #0x06]
1234	bx	lr
1235	LMEMCPY_8_PAD
1236
1237/*
1238 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1239 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.
 * (src & 3) == 3, so [r1, #0x05] and [r1, #0x01] are word-aligned; the
 * over-read byte ('x') beyond the 8 copied bytes is discarded by shifts. */
1240	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
1241	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
1242	ldrb	ip, [r1]		/* ip = ...0 */
1243	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
1244	strh	r1, [r0, #0x06]
1245#ifdef __ARMEB__
1246	mov	r3, r3, lsr #24		/* r3 = ...5 */
1247	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
1248	mov	r2, r2, lsr #24		/* r2 = ...1 */
1249	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
1250#else
1251	mov	r3, r3, lsl #24		/* r3 = 5... */
1252	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
1253	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
1254#endif
1255	str	r3, [r0, #0x02]
1256	strh	r2, [r0]
1257	bx	lr
1258	LMEMCPY_8_PAD
1259
1260/*
1261 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1262 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.  Two
 * aligned word loads; output pattern is byte / word / halfword / byte. */
1263	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1264	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1265	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
1266	strh	r1, [r0, #0x05]
1267#ifdef __ARMEB__
1268	strb	r3, [r0, #0x07]
1269	mov	r1, r2, lsr #24		/* r1 = ...0 */
1270	strb	r1, [r0]
1271	mov	r2, r2, lsl #8		/* r2 = 123. */
1272	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
1273	str	r2, [r0, #0x01]
1274#else
1275	strb	r2, [r0]
1276	mov	r1, r3, lsr #24		/* r1 = ...7 */
1277	strb	r1, [r0, #0x07]
1278	mov	r2, r2, lsr #8		/* r2 = .321 */
1279	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
1280	str	r2, [r0, #0x01]
1281#endif
1282	bx	lr
1283	LMEMCPY_8_PAD
1284
1285/*
1286 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1287 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.
 * (src & 3) == 1, so [r1, #0x03] is word-aligned. */
1288	ldrb	r3, [r1]		/* r3 = ...0 */
1289	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
1290	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
1291	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1292	strb	r3, [r0]
1293	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
1294#ifdef __ARMEB__
1295	strh	ip, [r0, #0x05]
1296	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
1297#else
1298	strh	r3, [r0, #0x05]
1299	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
1300#endif
1301	str	r2, [r0, #0x01]
1302	strb	r1, [r0, #0x07]
1303	bx	lr
1304	LMEMCPY_8_PAD
1305
1306/*
1307 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1308 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.
 * Halfword-aligned loads, re-packed as byte / word / halfword / byte. */
1309	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1310	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1311	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1312#ifdef __ARMEB__
1313	mov	ip, r2, lsr #8		/* ip = ...0 */
1314	strb	ip, [r0]
1315	mov	ip, r2, lsl #24		/* ip = 1... */
1316	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
1317	strb	r1, [r0, #0x07]
1318	mov	r1, r1, lsr #8		/* r1 = ...6 */
1319	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
1320#else
1321	strb	r2, [r0]
1322	mov	ip, r2, lsr #8		/* ip = ...1 */
1323	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
1324	mov	r2, r1, lsr #8		/* r2 = ...7 */
1325	strb	r2, [r0, #0x07]
1326	mov	r1, r1, lsl #8		/* r1 = .76. */
1327	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
1328#endif
1329	str	ip, [r0, #0x01]
1330	strh	r1, [r0, #0x05]
1331	bx	lr
1332	LMEMCPY_8_PAD
1333
1334/*
1335 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1336 */
/* NOTE(review): jump-table slot (LMEMCPY_8_PAD); r0 = dst, r1 = src.  Both
 * at (addr & 3) == 3: byte, aligned word, aligned halfword, byte — no
 * shifting needed since src and dst alignments match. */
1337	ldrb	r2, [r1]
1338	ldr	ip, [r1, #0x01]
1339	ldrh	r3, [r1, #0x05]
1340	ldrb	r1, [r1, #0x07]
1341	strb	r2, [r0]
1342	str	ip, [r0, #0x01]
1343	strh	r3, [r0, #0x05]
1344	strb	r1, [r0, #0x07]
1345	bx	lr
1346	LMEMCPY_8_PAD
1347
1348/******************************************************************************
1349 * Special case for 12 byte copies
1350 */
1351#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
1352#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
1353	LMEMCPY_C_PAD
.Lmemcpy_c:
/* Dispatch on the low alignment bits of dst (r0) and src (r1):
 *   index = ((dst & 3) << 2) | (src & 3),  a 4-bit value 0..15.
 * Each handler below occupies one 128-byte slot (LMEMCPY_C_PAD), so the
 * target is  .Lmemcpy_c + index * 128.  In ARM state pc reads as the current
 * instruction + 8, so at the `sub` pc = .Lmemcpy_c + 0x14 and subtracting
 * 0x14 yields the table base in r3 (.Lmemcpy_c itself is 128-byte aligned
 * via the pad above).  Index 0 (both word-aligned) falls through to the
 * first handler; `ands` set the flags that gate the `addne`. */
1355	and	r2, r1, #0x03
1356	orr	r2, r2, r0, lsl #2
1357	ands	r2, r2, #0x0f
1358	sub	r3, pc, #0x14
1359	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
1360
1361/*
1362 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1363 */
/* NOTE(review): fall-through (index 0) slot of the LMEMCPY_C table;
 * r0 = dst, r1 = src.  Three aligned word copies. */
1364	ldr	r2, [r1]
1365	ldr	r3, [r1, #0x04]
1366	ldr	r1, [r1, #0x08]
1367	str	r2, [r0]
1368	str	r3, [r0, #0x04]
1369	str	r1, [r0, #0x08]
1370	bx	lr
1371	LMEMCPY_C_PAD
1372
1373/*
1374 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1375 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  (src & 3) == 1, so [r1, #-1]/[#0x03]/[#0x07] are word-aligned;
 * the over-read byte ('x') is shifted out before the aligned word stores. */
1376	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
1377	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1378	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1379	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
1380#ifdef __ARMEB__
1381	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
1382	str	r2, [r0, #0x08]
1383	mov	r2, ip, lsr #24		/* r2 = ...7 */
1384	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
1385	mov	r1, r1, lsl #8		/* r1 = 012. */
1386	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
1387#else
1388	mov	r2, r2, lsl #24		/* r2 = B... */
1389	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
1390	str	r2, [r0, #0x08]
1391	mov	r2, ip, lsl #24		/* r2 = 7... */
1392	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
1393	mov	r1, r1, lsr #8		/* r1 = .210 */
1394	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
1395#endif
1396	str	r2, [r0, #0x04]
1397	str	r1, [r0]
1398	bx	lr
1399	LMEMCPY_C_PAD
1400
1401/*
1402 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1403 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Halfword-aligned loads merged into three aligned word stores. */
1404	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1405	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1406	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1407	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1408#ifdef __ARMEB__
1409	mov	r2, r2, lsl #16		/* r2 = 01.. */
1410	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
1411	str	r2, [r0]
1412	mov	r3, r3, lsl #16		/* r3 = 45.. */
1413	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
1414	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
1415#else
1416	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1417	str	r2, [r0]
1418	mov	r3, r3, lsr #16		/* r3 = ..54 */
1419	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
1420	mov	r1, r1, lsl #16		/* r1 = BA.. */
1421	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
1422#endif
1423	str	r3, [r0, #0x04]
1424	str	r1, [r0, #0x08]
1425	bx	lr
1426	LMEMCPY_C_PAD
1427
1428/*
1429 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1430 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  (src & 3) == 3, so [r1, #0x01]/[#0x05]/[#0x09] are
 * word-aligned; the trailing over-read byte ('x') is shifted out. */
1431	ldrb	r2, [r1]		/* r2 = ...0 */
1432	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1433	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1434	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1435#ifdef __ARMEB__
1436	mov	r2, r2, lsl #24		/* r2 = 0... */
1437	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
1438	str	r2, [r0]
1439	mov	r3, r3, lsl #24		/* r3 = 4... */
1440	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
1441	mov	r1, r1, lsr #8		/* r1 = .9AB */
1442	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
1443#else
1444	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1445	str	r2, [r0]
1446	mov	r3, r3, lsr #24		/* r3 = ...4 */
1447	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
1448	mov	r1, r1, lsl #8		/* r1 = BA9. */
1449	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
1450#endif
1451	str	r3, [r0, #0x04]
1452	str	r1, [r0, #0x08]
1453	bx	lr
1454	LMEMCPY_C_PAD
1455
1456/*
1457 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1458 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Three aligned word loads, stored byte/halfword/word/word/byte.
 * Two end-of-line comments below said "r1 =" where the destination is r2;
 * corrected to r2. */
1459	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1460	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1461	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
1462	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1463	strh	r1, [r0, #0x01]
1464#ifdef __ARMEB__
1465	mov	r1, r2, lsr #24		/* r1 = ...0 */
1466	strb	r1, [r0]
1467	mov	r1, r2, lsl #24		/* r1 = 3... */
1468	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
1469	mov	r1, r3, lsl #24		/* r1 = 7... */
1470	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
1471#else
1472	strb	r2, [r0]
1473	mov	r1, r2, lsr #24		/* r1 = ...3 */
1474	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
1475	mov	r1, r3, lsr #24		/* r1 = ...7 */
1476	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
1477	mov	ip, ip, lsr #24		/* ip = ...B */
1478#endif
1479	str	r2, [r0, #0x03]
1480	str	r1, [r0, #0x07]
1481	strb	ip, [r0, #0x0b]
1482	bx	lr
1483	LMEMCPY_C_PAD
1484
1485/*
1486 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1487 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Matching alignments: byte, halfword, two aligned words,
 * byte — no merging needed. */
1488	ldrb	r2, [r1]
1489	ldrh	r3, [r1, #0x01]
1490	ldr	ip, [r1, #0x03]
1491	strb	r2, [r0]
1492	ldr	r2, [r1, #0x07]
1493	ldrb	r1, [r1, #0x0b]
1494	strh	r3, [r0, #0x01]
1495	str	ip, [r0, #0x03]
1496	str	r2, [r0, #0x07]
1497	strb	r1, [r0, #0x0b]
1498	bx	lr
1499	LMEMCPY_C_PAD
1500
1501/*
1502 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1503 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Halfword-aligned loads re-packed for a byte-1 destination. */
1504	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1505	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1506	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1507	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1508#ifdef __ARMEB__
1509	mov	r2, r2, ror #8		/* r2 = 1..0 */
1510	strb	r2, [r0]
1511	mov	r2, r2, lsr #16		/* r2 = ..1. */
1512	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
1513	strh	r2, [r0, #0x01]
1514	mov	r2, r3, lsl #8		/* r2 = 345. */
1515	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
1516	mov	r2, ip, lsl #8		/* r2 = 789. */
1517	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
1518#else
1519	strb	r2, [r0]
1520	mov	r2, r2, lsr #8		/* r2 = ...1 */
1521	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
1522	strh	r2, [r0, #0x01]
1523	mov	r2, r3, lsr #8		/* r2 = .543 */
1524	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
1525	mov	r2, ip, lsr #8		/* r2 = .987 */
1526	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
1527	mov	r1, r1, lsr #8		/* r1 = ...B */
1528#endif
1529	str	r3, [r0, #0x03]
1530	str	r2, [r0, #0x07]
1531	strb	r1, [r0, #0x0b]
1532	bx	lr
1533	LMEMCPY_C_PAD
1534
1535/*
1536 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1537 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  (src & 3) == 3, so the three ldr's are word-aligned; 'x'
 * marks the over-read byte that is discarded. */
1538	ldrb	r2, [r1]
1539	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1540	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1541	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1542	strb	r2, [r0]
1543#ifdef __ARMEB__
1544	mov	r2, r3, lsr #16		/* r2 = ..12 */
1545	strh	r2, [r0, #0x01]
1546	mov	r3, r3, lsl #16		/* r3 = 34.. */
1547	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
1548	mov	ip, ip, lsl #16		/* ip = 78.. */
1549	orr	ip, ip, r1, lsr #16	/* ip = 789A */
1550	mov	r1, r1, lsr #8		/* r1 = .9AB */
1551#else
1552	strh	r3, [r0, #0x01]
1553	mov	r3, r3, lsr #16		/* r3 = ..43 */
1554	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
1555	mov	ip, ip, lsr #16		/* ip = ..87 */
1556	orr	ip, ip, r1, lsl #16	/* ip = A987 */
1557	mov	r1, r1, lsr #16		/* r1 = ..xB */
1558#endif
1559	str	r3, [r0, #0x03]
1560	str	ip, [r0, #0x07]
1561	strb	r1, [r0, #0x0b]
1562	bx	lr
1563	LMEMCPY_C_PAD
1564
1565/*
1566 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1567 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Aligned word loads, output halfword / word / word / halfword. */
1568	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
1569	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1570	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
1571	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
1572#ifdef __ARMEB__
1573	strh	r1, [r0]
1574	mov	r1, ip, lsl #16		/* r1 = 23.. */
1575	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
1576	mov	r3, r3, lsl #16		/* r3 = 67.. */
1577	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
1578#else
1579	strh	ip, [r0]
1580	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
1581	mov	r3, r3, lsr #16		/* r3 = ..76 */
1582	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
1583	mov	r2, r2, lsr #16		/* r2 = ..BA */
1584#endif
1585	str	r1, [r0, #0x02]
1586	str	r3, [r0, #0x06]
1587	strh	r2, [r0, #0x0a]
1588	bx	lr
1589	LMEMCPY_C_PAD
1590
1591/*
1592 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1593 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  (src & 3) == 1, so [r1, #-1]/[#0x03]/[#0x07] are word-aligned;
 * the over-read byte ('x') is shifted away. */
1594	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1595	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1596	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
1597	strh	ip, [r0]
1598	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1599	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
1600#ifdef __ARMEB__
1601	mov	r2, r2, lsl #24		/* r2 = 2... */
1602	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
1603	mov	r3, r3, lsl #24		/* r3 = 6... */
1604	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
1605	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
1606#else
1607	mov	r2, r2, lsr #24		/* r2 = ...2 */
1608	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
1609	mov	r3, r3, lsr #24		/* r3 = ...6 */
1610	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
1611	mov	r1, r1, lsl #8		/* r1 = ..B. */
1612	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
1613#endif
1614	str	r2, [r0, #0x02]
1615	str	r3, [r0, #0x06]
1616	strh	r1, [r0, #0x0a]
1617	bx	lr
1618	LMEMCPY_C_PAD
1619
1620/*
1621 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1622 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Matching halfword alignment: direct hw/word/word/hw copy. */
1623	ldrh	r2, [r1]
1624	ldr	r3, [r1, #0x02]
1625	ldr	ip, [r1, #0x06]
1626	ldrh	r1, [r1, #0x0a]
1627	strh	r2, [r0]
1628	str	r3, [r0, #0x02]
1629	str	ip, [r0, #0x06]
1630	strh	r1, [r0, #0x0a]
1631	bx	lr
1632	LMEMCPY_C_PAD
1633
1634/*
1635 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
1636 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  (src & 3) == 3, so [r1, #0x09]/[#0x05]/[#0x01] are
 * word-aligned; the over-read byte ('x') is discarded.  Stores run
 * high-to-low here. */
1637	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
1638	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
1639	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
1640	strh	ip, [r0, #0x0a]
1641	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
1642	ldrb	r1, [r1]		/* r1 = ...0 */
1643#ifdef __ARMEB__
1644	mov	r2, r2, lsr #24		/* r2 = ...9 */
1645	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
1646	mov	r3, r3, lsr #24		/* r3 = ...5 */
1647	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
1648	mov	r1, r1, lsl #8		/* r1 = ..0. */
1649	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
1650#else
1651	mov	r2, r2, lsl #24		/* r2 = 9... */
1652	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
1653	mov	r3, r3, lsl #24		/* r3 = 5... */
1654	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
1655	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
1656#endif
1657	str	r2, [r0, #0x06]
1658	str	r3, [r0, #0x02]
1659	strh	r1, [r0]
1660	bx	lr
1661	LMEMCPY_C_PAD
1662
1663/*
1664 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
1665 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Aligned word loads re-packed as byte / word / word /
 * halfword / byte for a byte-3 destination. */
1666	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1667	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
1668	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
1669#ifdef __ARMEB__
1670	mov	r3, r2, lsr #24		/* r3 = ...0 */
1671	strb	r3, [r0]
1672	mov	r2, r2, lsl #8		/* r2 = 123. */
1673	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
1674	str	r2, [r0, #0x01]
1675	mov	r2, ip, lsl #8		/* r2 = 567. */
1676	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
1677	str	r2, [r0, #0x05]
1678	mov	r2, r1, lsr #8		/* r2 = ..9A */
1679	strh	r2, [r0, #0x09]
1680	strb	r1, [r0, #0x0b]
1681#else
1682	strb	r2, [r0]
1683	mov	r3, r2, lsr #8		/* r3 = .321 */
1684	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
1685	str	r3, [r0, #0x01]
1686	mov	r3, ip, lsr #8		/* r3 = .765 */
1687	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
1688	str	r3, [r0, #0x05]
1689	mov	r1, r1, lsr #8		/* r1 = .BA9 */
1690	strh	r1, [r0, #0x09]
1691	mov	r1, r1, lsr #16		/* r1 = ...B */
1692	strb	r1, [r0, #0x0b]
1693#endif
1694	bx	lr
1695	LMEMCPY_C_PAD
1696
1697/*
1698 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
1699 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  (src & 3) == 1, so [r1, #0x07]/[#0x03]/[#-1] are word-aligned;
 * the over-read byte ('x') is shifted away.  Stores run high-to-low. */
1700	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
1701	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
1702	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
1703	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
1704	strb	r2, [r0, #0x0b]
1705#ifdef __ARMEB__
1706	strh	r3, [r0, #0x09]
1707	mov	r3, r3, lsr #16		/* r3 = ..78 */
1708	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
1709	mov	ip, ip, lsr #16		/* ip = ..34 */
1710	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
1711	mov	r1, r1, lsr #16		/* r1 = ..x0 */
1712#else
1713	mov	r2, r3, lsr #16		/* r2 = ..A9 */
1714	strh	r2, [r0, #0x09]
1715	mov	r3, r3, lsl #16		/* r3 = 87.. */
1716	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
1717	mov	ip, ip, lsl #16		/* ip = 43.. */
1718	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
1719	mov	r1, r1, lsr #8		/* r1 = .210 */
1720#endif
1721	str	r3, [r0, #0x05]
1722	str	ip, [r0, #0x01]
1723	strb	r1, [r0]
1724	bx	lr
1725	LMEMCPY_C_PAD
1726
1727/*
1728 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
1729 */
/* NOTE(review): 128-byte jump-table slot (LMEMCPY_C_PAD); r0 = dst,
 * r1 = src.  Unlike the other slots, the whole body (loads included) is
 * duplicated per endianness rather than sharing load/store tails. */
1730#ifdef __ARMEB__
1731	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
1732	ldr	ip, [r1, #0x06]		/* ip = 6789 */
1733	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
1734	ldrh	r1, [r1]		/* r1 = ..01 */
1735	strb	r2, [r0, #0x0b]
1736	mov	r2, r2, lsr #8		/* r2 = ...A */
1737	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
1738	mov	ip, ip, lsr #8		/* ip = .678 */
1739	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
1740	mov	r3, r3, lsr #8		/* r3 = .234 */
1741	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
1742	mov	r1, r1, lsr #8		/* r1 = ...0 */
1743	strb	r1, [r0]
1744	str	r3, [r0, #0x01]
1745	str	ip, [r0, #0x05]
1746	strh	r2, [r0, #0x09]
1747#else
1748	ldrh	r2, [r1]		/* r2 = ..10 */
1749	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
1750	ldr	ip, [r1, #0x06]		/* ip = 9876 */
1751	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
1752	strb	r2, [r0]
1753	mov	r2, r2, lsr #8		/* r2 = ...1 */
1754	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
1755	mov	r3, r3, lsr #24		/* r3 = ...5 */
1756	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
1757	mov	ip, ip, lsr #24		/* ip = ...9 */
1758	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
1759	mov	r1, r1, lsr #8		/* r1 = ...B */
1760	str	r2, [r0, #0x01]
1761	str	r3, [r0, #0x05]
1762	strh	ip, [r0, #0x09]
1763	strb	r1, [r0, #0x0b]
1764#endif
1765	bx	lr
1766	LMEMCPY_C_PAD
1767
1768/*
1769 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
1770 */
/* NOTE(review): last slot of the LMEMCPY_C table; r0 = dst, r1 = src.
 * Matching byte-3 alignment: byte, aligned word, aligned word, halfword,
 * byte — no merging.  No trailing pad needed after the final slot. */
1771	ldrb	r2, [r1]
1772	ldr	r3, [r1, #0x01]
1773	ldr	ip, [r1, #0x05]
1774	strb	r2, [r0]
1775	ldrh	r2, [r1, #0x09]
1776	ldrb	r1, [r1, #0x0b]
1777	str	r3, [r0, #0x01]
1778	str	ip, [r0, #0x05]
1779	strh	r2, [r0, #0x09]
1780	strb	r1, [r0, #0x0b]
1781	bx	lr
1782#endif	/* !_STANDALONE */
1783