xref: /onnv-gate/usr/src/lib/libc/amd64/gen/memcpy.s (revision 7298:b69e27387f74)
10Sstevel@tonic-gate/*
26320Sbholler * CDDL HEADER START
36320Sbholler *
46320Sbholler * The contents of this file are subject to the terms of the
56320Sbholler * Common Development and Distribution License (the "License").
66320Sbholler * You may not use this file except in compliance with the License.
76320Sbholler *
86320Sbholler * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
96320Sbholler * or http://www.opensolaris.org/os/licensing.
106320Sbholler * See the License for the specific language governing permissions
116320Sbholler * and limitations under the License.
126320Sbholler *
136320Sbholler * When distributing Covered Code, include this CDDL HEADER in each
146320Sbholler * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
156320Sbholler * If applicable, add the following below this CDDL HEADER, with the
166320Sbholler * fields enclosed by brackets "[]" replaced with your own identifying
176320Sbholler * information: Portions Copyright [yyyy] [name of copyright owner]
186320Sbholler *
196320Sbholler * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate
220Sstevel@tonic-gate/*
236812Sraf * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
246812Sraf * Use is subject to license terms.
256812Sraf */
266812Sraf
276812Sraf/*
286320Sbholler * Copyright (c) 2008, Intel Corporation
290Sstevel@tonic-gate * All rights reserved.
300Sstevel@tonic-gate */
310Sstevel@tonic-gate
326320Sbholler/*
336320Sbholler * memcpy.s - copies a block of memory from source to destination
346320Sbholler *	Implements memcpy() and memmove() libc primitives.
356320Sbholler */
366812Sraf
37*7298SMark.J.Nelson@Sun.COM	.file	"memcpy.s"
380Sstevel@tonic-gate
390Sstevel@tonic-gate#include <sys/asm_linkage.h>
406812Sraf
410Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memmove,function)
420Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memcpy,function)
430Sstevel@tonic-gate
440Sstevel@tonic-gate#include "cache.h"
456320Sbholler#include "proc64_id.h"
460Sstevel@tonic-gate
476320Sbholler#define L(s) .memcpy/**/s
486320Sbholler
496320Sbholler/*
506320Sbholler * memcpy algorithm overview:
516320Sbholler *
526320Sbholler * Thresholds used below were determined experimentally.
536320Sbholler *
546320Sbholler * Pseudo code:
556320Sbholler *
566320Sbholler * If (size <= 128 bytes) {
576320Sbholler *	do unrolled code (primarily 8-byte loads/stores) regardless of
586320Sbholler *	alignment.
596320Sbholler * } else {
606320Sbholler *	Align destination to 16-byte boundary
616320Sbholler *
626320Sbholler *      if (NO_SSE) {
636320Sbholler *		If (size > half of the largest level cache) {
646320Sbholler *			Use 8-byte non-temporal stores (64-bytes/loop)
656320Sbholler *		} else {
666320Sbholler *			if (size > 4K && size <= half l1 cache size) {
676320Sbholler *				Use rep movsq
686320Sbholler *			} else {
696320Sbholler *				Use 8-byte loads/stores (64 bytes per loop)
706320Sbholler *			}
716320Sbholler *		}
726320Sbholler *
736320Sbholler *	} else { **USE SSE**
746320Sbholler *		If (size > half of the largest level cache) {
756320Sbholler *			Use 16-byte non-temporal stores (128-bytes per loop)
766320Sbholler *		} else {
776320Sbholler *			If (both source and destination are aligned) {
786320Sbholler *			    Use 16-byte aligned loads and stores (128 bytes/loop)
796320Sbholler *			} else {
806320Sbholler *			    use pairs of xmm registers with SSE2 or SSSE3
816320Sbholler *			    instructions to concatenate and shift appropriately
826320Sbholler *			    to account for source unalignment. This enables
836320Sbholler *			    16-byte aligned loads to be done.
846320Sbholler *			}
856320Sbholler *		}
866320Sbholler *	}
876320Sbholler *
886320Sbholler *	Finish any remaining bytes via unrolled code above.
896320Sbholler * }
906320Sbholler *
916320Sbholler * memmove overview:
926320Sbholler *	memmove is the same as memcpy except one case where copy needs to be
936320Sbholler *	done backwards. The copy backwards code is done in a similar manner.
946320Sbholler */
956320Sbholler
/*
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * SysV AMD64 ABI: %rdi = dst, %rsi = src, %rdx = len.
 * Only the copy direction is decided here; the routine then tail-jumps
 * into the shared memcpy code.  L(CopyForward) (the memcpy entry, below)
 * saves dst in %rax as the return value; L(CopyBackwards) is outside
 * this chunk — presumably it does the same (NOTE(review): confirm).
 * Unsigned compares (jbe/jb) are used because these are addresses.
 */
966320Sbholler	ENTRY(memmove)
976320Sbholler	cmp	%rsi,%rdi		# if dst <= src
986320Sbholler	jbe	L(CopyForward)		# then do copy forward (safe even if overlapping)
996320Sbholler	mov	%rsi,%r9		# move src to r9
1006320Sbholler	add	%rdx,%r9		# add len to get addr of end of src
1016320Sbholler	cmp	%r9,%rdi		# if dst < end of src
1026320Sbholler	jb	L(CopyBackwards)	# then do copy backwards (dst overlaps tail of src)
1036320Sbholler	jmp    L(CopyForward)		# dst >= src+len: regions disjoint, forward copy
1046320Sbholler
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * Also the forward-copy entry for memmove via L(CopyForward).
 * Register roles from here on: %rcx = dst, %rdx = src, %r8 = len,
 * %rax = dst (preserved as the return value).
 * Sizes <= 128 bytes are dispatched through the L(fwdPxQx) jump table
 * of unrolled copies; larger sizes go to L(ck_use_sse2) (not visible
 * in this chunk).
 */
1056320Sbholler	ENTRY (memcpy)
1066320SbhollerL(CopyForward):
1076320Sbholler	mov    %rdx,%r8			# r8 = len
1086320Sbholler	mov    %rdi,%rcx		# rcx = dst
1096320Sbholler	mov    %rsi,%rdx		# rdx = src
1106320Sbholler	mov    %rdi,%rax		# rax = dst, the memcpy/memmove return value
1116320Sbholler	lea    L(fwdPxQx)(%rip),%r11	# r11 = base of small-size jump table
1126320Sbholler	cmp    $0x80,%r8		# 128: threshold for the unrolled small-copy path
					# NOTE(review): jg is a signed compare, so a len with
					# the top bit set would fall through to the table path;
					# such sizes are assumed invalid/unreachable.
1136320Sbholler	jg     L(ck_use_sse2)		# > 128 bytes: bulk-copy path (outside this chunk)
1146320Sbholler	add    %r8,%rcx			# rcx = one past end of dst
1156320Sbholler	add    %r8,%rdx			# rdx = one past end of src
1166320Sbholler
1176320Sbholler	movslq (%r11,%r8,4),%r10	# r10 = sign-extended 32-bit table entry for len
1186320Sbholler	lea    (%r10,%r11,1),%r11	# r11 = table base + offset = handler address
1196320Sbholler	jmpq   *%r11			# dispatch; handlers copy backward from the ends
1206320Sbholler
/*
 * L(ShrtAlignNew): align dst (%rcx) to a 16-byte boundary for the
 * bulk-copy paths.  Dispatches on (dst & 0xf) through the L(AliPxQx)
 * table: index 0 jumps straight to L(now_qw_aligned); index i jumps
 * to a stub that copies (16 - i) bytes and advances rcx/rdx, adjusting
 * the remaining length in %r8 (see the A*Q* stubs below).
 */
1216320Sbholler	.balign 16
1226320SbhollerL(ShrtAlignNew):
1236320Sbholler	lea    L(AliPxQx)(%rip),%r11	# r11 = base of alignment jump table
1246320Sbholler	mov    %rcx,%r9
1256320Sbholler	and    $0xf,%r9			# r9 = dst misalignment within 16 bytes
1266320Sbholler
1276320Sbholler	movslq (%r11,%r9,4),%r10	# r10 = sign-extended table entry
1286320Sbholler	lea    (%r10,%r11,1),%r11	# r11 = absolute stub address
1296320Sbholler	jmpq   *%r11			# tail-dispatch to alignment stub
1306320Sbholler
1316320Sbholler	.balign 16
1326320SbhollerL(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
1336320Sbholler           .int        L(P1Q0)-L(fwdPxQx)
1346320Sbholler           .int        L(P2Q0)-L(fwdPxQx)
1356320Sbholler           .int        L(P3Q0)-L(fwdPxQx)
1366320Sbholler           .int        L(P4Q0)-L(fwdPxQx)
1376320Sbholler           .int        L(P5Q0)-L(fwdPxQx)
1386320Sbholler           .int        L(P6Q0)-L(fwdPxQx)
1396320Sbholler           .int        L(P7Q0)-L(fwdPxQx)
1406320Sbholler
1416320Sbholler           .int        L(P0Q1)-L(fwdPxQx)
1426320Sbholler           .int        L(P1Q1)-L(fwdPxQx)
1436320Sbholler           .int        L(P2Q1)-L(fwdPxQx)
1446320Sbholler           .int        L(P3Q1)-L(fwdPxQx)
1456320Sbholler           .int        L(P4Q1)-L(fwdPxQx)
1466320Sbholler           .int        L(P5Q1)-L(fwdPxQx)
1476320Sbholler           .int        L(P6Q1)-L(fwdPxQx)
1486320Sbholler           .int        L(P7Q1)-L(fwdPxQx)
1496320Sbholler
1506320Sbholler           .int        L(P0Q2)-L(fwdPxQx)
1516320Sbholler           .int        L(P1Q2)-L(fwdPxQx)
1526320Sbholler           .int        L(P2Q2)-L(fwdPxQx)
1536320Sbholler           .int        L(P3Q2)-L(fwdPxQx)
1546320Sbholler           .int        L(P4Q2)-L(fwdPxQx)
1556320Sbholler           .int        L(P5Q2)-L(fwdPxQx)
1566320Sbholler           .int        L(P6Q2)-L(fwdPxQx)
1576320Sbholler           .int        L(P7Q2)-L(fwdPxQx)
1586320Sbholler
1596320Sbholler           .int        L(P0Q3)-L(fwdPxQx)
1606320Sbholler           .int        L(P1Q3)-L(fwdPxQx)
1616320Sbholler           .int        L(P2Q3)-L(fwdPxQx)
1626320Sbholler           .int        L(P3Q3)-L(fwdPxQx)
1636320Sbholler           .int        L(P4Q3)-L(fwdPxQx)
1646320Sbholler           .int        L(P5Q3)-L(fwdPxQx)
1656320Sbholler           .int        L(P6Q3)-L(fwdPxQx)
1666320Sbholler           .int        L(P7Q3)-L(fwdPxQx)
1676320Sbholler
1686320Sbholler           .int        L(P0Q4)-L(fwdPxQx)
1696320Sbholler           .int        L(P1Q4)-L(fwdPxQx)
1706320Sbholler           .int        L(P2Q4)-L(fwdPxQx)
1716320Sbholler           .int        L(P3Q4)-L(fwdPxQx)
1726320Sbholler           .int        L(P4Q4)-L(fwdPxQx)
1736320Sbholler           .int        L(P5Q4)-L(fwdPxQx)
1746320Sbholler           .int        L(P6Q4)-L(fwdPxQx)
1756320Sbholler           .int        L(P7Q4)-L(fwdPxQx)
1766320Sbholler
1776320Sbholler           .int        L(P0Q5)-L(fwdPxQx)
1786320Sbholler           .int        L(P1Q5)-L(fwdPxQx)
1796320Sbholler           .int        L(P2Q5)-L(fwdPxQx)
1806320Sbholler           .int        L(P3Q5)-L(fwdPxQx)
1816320Sbholler           .int        L(P4Q5)-L(fwdPxQx)
1826320Sbholler           .int        L(P5Q5)-L(fwdPxQx)
1836320Sbholler           .int        L(P6Q5)-L(fwdPxQx)
1846320Sbholler           .int        L(P7Q5)-L(fwdPxQx)
1856320Sbholler
1866320Sbholler           .int        L(P0Q6)-L(fwdPxQx)
1876320Sbholler           .int        L(P1Q6)-L(fwdPxQx)
1886320Sbholler           .int        L(P2Q6)-L(fwdPxQx)
1896320Sbholler           .int        L(P3Q6)-L(fwdPxQx)
1906320Sbholler           .int        L(P4Q6)-L(fwdPxQx)
1916320Sbholler           .int        L(P5Q6)-L(fwdPxQx)
1926320Sbholler           .int        L(P6Q6)-L(fwdPxQx)
1936320Sbholler           .int        L(P7Q6)-L(fwdPxQx)
1946320Sbholler
1956320Sbholler           .int        L(P0Q7)-L(fwdPxQx)
1966320Sbholler           .int        L(P1Q7)-L(fwdPxQx)
1976320Sbholler           .int        L(P2Q7)-L(fwdPxQx)
1986320Sbholler           .int        L(P3Q7)-L(fwdPxQx)
1996320Sbholler           .int        L(P4Q7)-L(fwdPxQx)
2006320Sbholler           .int        L(P5Q7)-L(fwdPxQx)
2016320Sbholler           .int        L(P6Q7)-L(fwdPxQx)
2026320Sbholler           .int        L(P7Q7)-L(fwdPxQx)
2036320Sbholler
2046320Sbholler           .int        L(P0Q8)-L(fwdPxQx)
2056320Sbholler           .int        L(P1Q8)-L(fwdPxQx)
2066320Sbholler           .int        L(P2Q8)-L(fwdPxQx)
2076320Sbholler           .int        L(P3Q8)-L(fwdPxQx)
2086320Sbholler           .int        L(P4Q8)-L(fwdPxQx)
2096320Sbholler           .int        L(P5Q8)-L(fwdPxQx)
2106320Sbholler           .int        L(P6Q8)-L(fwdPxQx)
2116320Sbholler           .int        L(P7Q8)-L(fwdPxQx)
2126320Sbholler
2136320Sbholler           .int        L(P0Q9)-L(fwdPxQx)
2146320Sbholler           .int        L(P1Q9)-L(fwdPxQx)
2156320Sbholler           .int        L(P2Q9)-L(fwdPxQx)
2166320Sbholler           .int        L(P3Q9)-L(fwdPxQx)
2176320Sbholler           .int        L(P4Q9)-L(fwdPxQx)
2186320Sbholler           .int        L(P5Q9)-L(fwdPxQx)
2196320Sbholler           .int        L(P6Q9)-L(fwdPxQx)
2206320Sbholler           .int        L(P7Q9)-L(fwdPxQx)
2216320Sbholler
2226320Sbholler           .int        L(P0QA)-L(fwdPxQx)
2236320Sbholler           .int        L(P1QA)-L(fwdPxQx)
2246320Sbholler           .int        L(P2QA)-L(fwdPxQx)
2256320Sbholler           .int        L(P3QA)-L(fwdPxQx)
2266320Sbholler           .int        L(P4QA)-L(fwdPxQx)
2276320Sbholler           .int        L(P5QA)-L(fwdPxQx)
2286320Sbholler           .int        L(P6QA)-L(fwdPxQx)
2296320Sbholler           .int        L(P7QA)-L(fwdPxQx)
2306320Sbholler
2316320Sbholler           .int        L(P0QB)-L(fwdPxQx)
2326320Sbholler           .int        L(P1QB)-L(fwdPxQx)
2336320Sbholler           .int        L(P2QB)-L(fwdPxQx)
2346320Sbholler           .int        L(P3QB)-L(fwdPxQx)
2356320Sbholler           .int        L(P4QB)-L(fwdPxQx)
2366320Sbholler           .int        L(P5QB)-L(fwdPxQx)
2376320Sbholler           .int        L(P6QB)-L(fwdPxQx)
2386320Sbholler           .int        L(P7QB)-L(fwdPxQx)
2396320Sbholler
2406320Sbholler           .int        L(P0QC)-L(fwdPxQx)
2416320Sbholler           .int        L(P1QC)-L(fwdPxQx)
2426320Sbholler           .int        L(P2QC)-L(fwdPxQx)
2436320Sbholler           .int        L(P3QC)-L(fwdPxQx)
2446320Sbholler           .int        L(P4QC)-L(fwdPxQx)
2456320Sbholler           .int        L(P5QC)-L(fwdPxQx)
2466320Sbholler           .int        L(P6QC)-L(fwdPxQx)
2476320Sbholler           .int        L(P7QC)-L(fwdPxQx)
2486320Sbholler
2496320Sbholler           .int        L(P0QD)-L(fwdPxQx)
2506320Sbholler           .int        L(P1QD)-L(fwdPxQx)
2516320Sbholler           .int        L(P2QD)-L(fwdPxQx)
2526320Sbholler           .int        L(P3QD)-L(fwdPxQx)
2536320Sbholler           .int        L(P4QD)-L(fwdPxQx)
2546320Sbholler           .int        L(P5QD)-L(fwdPxQx)
2556320Sbholler           .int        L(P6QD)-L(fwdPxQx)
2566320Sbholler           .int        L(P7QD)-L(fwdPxQx)
2576320Sbholler
2586320Sbholler           .int        L(P0QE)-L(fwdPxQx)
2596320Sbholler           .int        L(P1QE)-L(fwdPxQx)
2606320Sbholler           .int        L(P2QE)-L(fwdPxQx)
2616320Sbholler           .int        L(P3QE)-L(fwdPxQx)
2626320Sbholler           .int        L(P4QE)-L(fwdPxQx)
2636320Sbholler           .int        L(P5QE)-L(fwdPxQx)
2646320Sbholler           .int        L(P6QE)-L(fwdPxQx)
2656320Sbholler           .int        L(P7QE)-L(fwdPxQx)
2666320Sbholler
2676320Sbholler           .int        L(P0QF)-L(fwdPxQx)
2686320Sbholler           .int        L(P1QF)-L(fwdPxQx)
2696320Sbholler           .int        L(P2QF)-L(fwdPxQx)
2706320Sbholler           .int        L(P3QF)-L(fwdPxQx)
2716320Sbholler           .int        L(P4QF)-L(fwdPxQx)
2726320Sbholler           .int        L(P5QF)-L(fwdPxQx)
2736320Sbholler           .int        L(P6QF)-L(fwdPxQx)
2746320Sbholler           .int        L(P7QF)-L(fwdPxQx)
2756320Sbholler
2766320Sbholler           .int        L(P0QG)-L(fwdPxQx)	# 0x80
2776320Sbholler
2786320Sbholler	   .balign 16
2796320SbhollerL(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
2806320Sbholler           .int        L(A1Q0)-L(AliPxQx)
2816320Sbholler           .int        L(A2Q0)-L(AliPxQx)
2826320Sbholler           .int        L(A3Q0)-L(AliPxQx)
2836320Sbholler           .int        L(A4Q0)-L(AliPxQx)
2846320Sbholler           .int        L(A5Q0)-L(AliPxQx)
2856320Sbholler           .int        L(A6Q0)-L(AliPxQx)
2866320Sbholler           .int        L(A7Q0)-L(AliPxQx)
2876320Sbholler           .int        L(A0Q1)-L(AliPxQx)
2886320Sbholler           .int        L(A1Q1)-L(AliPxQx)
2896320Sbholler           .int        L(A2Q1)-L(AliPxQx)
2906320Sbholler           .int        L(A3Q1)-L(AliPxQx)
2916320Sbholler           .int        L(A4Q1)-L(AliPxQx)
2926320Sbholler           .int        L(A5Q1)-L(AliPxQx)
2936320Sbholler           .int        L(A6Q1)-L(AliPxQx)
2946320Sbholler           .int        L(A7Q1)-L(AliPxQx)
2956320Sbholler
2966320Sbholler	.balign 16
2976320SbhollerL(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
2986320Sbholler	movzbq (%rdx),%r11
2996320Sbholler	sub    $0xf,%r8
3006320Sbholler	mov    %r11b,(%rcx)
3016320Sbholler
3026320Sbholler	movzwq 0x1(%rdx),%r10
3036320Sbholler	mov    %r10w,0x1(%rcx)
3046320Sbholler
3056320Sbholler	mov    0x3(%rdx),%r9d
3066320Sbholler	mov    %r9d,0x3(%rcx)
3076320Sbholler
3086320Sbholler	mov    0x7(%rdx),%r11
3096320Sbholler	add    $0xf,%rdx
3106320Sbholler	mov    %r11,0x7(%rcx)
3116320Sbholler
3126320Sbholler	add    $0xf,%rcx
3136320Sbholler	jmp    L(now_qw_aligned)
3146320Sbholler
3156320Sbholler	.balign 16
3166320SbhollerL(A2Q0):			# ; need to move 8+ 6=2+4 bytes
3176320Sbholler	movzwq (%rdx),%r10
3186320Sbholler	sub    $0xe,%r8
3196320Sbholler	mov    %r10w,(%rcx)
3206320Sbholler
3216320Sbholler	mov    0x2(%rdx),%r9d
3226320Sbholler	mov    %r9d,0x2(%rcx)
3236320Sbholler
3246320Sbholler	mov    0x6(%rdx),%r11
3256320Sbholler	add    $0xe,%rdx
3266320Sbholler	mov    %r11,0x6(%rcx)
3276320Sbholler	add    $0xe,%rcx
3286320Sbholler	jmp    L(now_qw_aligned)
3296320Sbholler
3306320Sbholler	.balign 16
3316320SbhollerL(A3Q0):			# ; need to move 8+ 5=1+4 bytes
3326320Sbholler	movzbq (%rdx),%r11
3336320Sbholler	sub    $0xd,%r8
3346320Sbholler	mov    %r11b,(%rcx)
3356320Sbholler
3366320Sbholler	mov    0x1(%rdx),%r9d
3376320Sbholler	mov    %r9d,0x1(%rcx)
3386320Sbholler
3396320Sbholler	mov    0x5(%rdx),%r10
3406320Sbholler	add    $0xd,%rdx
3416320Sbholler	mov    %r10,0x5(%rcx)
3426320Sbholler
3436320Sbholler	add    $0xd,%rcx
3446320Sbholler	jmp    L(now_qw_aligned)
3456320Sbholler
3466320Sbholler	.balign 16
3476320SbhollerL(A4Q0):			# ; need to move 8+4 bytes
3486320Sbholler	mov    (%rdx),%r9d
3496320Sbholler	sub    $0xc,%r8
3506320Sbholler	mov    %r9d,(%rcx)
3516320Sbholler
3526320Sbholler	mov    0x4(%rdx),%r10
3536320Sbholler	add    $0xc,%rdx
3546320Sbholler	mov    %r10,0x4(%rcx)
3556320Sbholler
3566320Sbholler	add    $0xc,%rcx
3576320Sbholler	jmp    L(now_qw_aligned)
3586320Sbholler
3596320Sbholler	.balign 16
3606320SbhollerL(A5Q0):			# ; need to move 8+ 3=1+2 bytes
3616320Sbholler	movzbq (%rdx),%r11
3626320Sbholler	sub    $0xb,%r8
3636320Sbholler	mov    %r11b,(%rcx)
3646320Sbholler
3656320Sbholler	movzwq 0x1(%rdx),%r10
3666320Sbholler	mov    %r10w,0x1(%rcx)
3676320Sbholler
3686320Sbholler	mov    0x3(%rdx),%r9
3696320Sbholler	add    $0xb,%rdx
3706320Sbholler	mov    %r9,0x3(%rcx)
3716320Sbholler
3726320Sbholler	add    $0xb,%rcx
3736320Sbholler	jmp    L(now_qw_aligned)
3746320Sbholler
3756320Sbholler	.balign 16
3766320SbhollerL(A6Q0):			# ; need to move 8+2 bytes
3776320Sbholler	movzwq (%rdx),%r10
3786320Sbholler	sub    $0xa,%r8
3796320Sbholler	mov    %r10w,(%rcx)
3806320Sbholler
3816320Sbholler	mov    0x2(%rdx),%r9
3826320Sbholler	add    $0xa,%rdx
3836320Sbholler	mov    %r9,0x2(%rcx)
3846320Sbholler
3856320Sbholler	add    $0xa,%rcx
3866320Sbholler	jmp    L(now_qw_aligned)
3876320Sbholler
3886320Sbholler	.balign 16
3896320SbhollerL(A7Q0):			# ; need to move 8+1 byte
3906320Sbholler	movzbq (%rdx),%r11
3916320Sbholler	sub    $0x9,%r8
3926320Sbholler	mov    %r11b,(%rcx)
3936320Sbholler
3946320Sbholler	mov    0x1(%rdx),%r10
3956320Sbholler	add    $0x9,%rdx
3966320Sbholler	mov    %r10,0x1(%rcx)
3976320Sbholler
3986320Sbholler	add    $0x9,%rcx
3996320Sbholler	jmp    L(now_qw_aligned)
4006320Sbholler
4016320Sbholler	.balign 16
4026320SbhollerL(A0Q1):			# ; need to move 8 bytes
4036320Sbholler
4046320Sbholler	mov    (%rdx),%r10
4056320Sbholler	add    $0x8,%rdx
4066320Sbholler	sub    $0x8,%r8
4076320Sbholler	mov    %r10,(%rcx)
4086320Sbholler
4096320Sbholler	add    $0x8,%rcx
4106320Sbholler	jmp    L(now_qw_aligned)
4116320Sbholler
4126320Sbholler	.balign 16
4136320SbhollerL(A1Q1):			# ; need to move 7=1+2+4 bytes
4146320Sbholler	movzbq (%rdx),%r11
4156320Sbholler	sub    $0x7,%r8
4166320Sbholler	mov    %r11b,(%rcx)
4176320Sbholler
4186320Sbholler	movzwq 0x1(%rdx),%r10
4196320Sbholler	mov    %r10w,0x1(%rcx)
4206320Sbholler
4216320Sbholler	mov    0x3(%rdx),%r9d
4226320Sbholler	add    $0x7,%rdx
4236320Sbholler	mov    %r9d,0x3(%rcx)
4246320Sbholler	add    $0x7,%rcx
4256320Sbholler	jmp    L(now_qw_aligned)
4266320Sbholler
4276320Sbholler	.balign 16
4286320SbhollerL(A2Q1):			# ; need to move 6=2+4 bytes
4296320Sbholler	movzwq (%rdx),%r10
4306320Sbholler	sub    $0x6,%r8
4316320Sbholler	mov    %r10w,(%rcx)
4326320Sbholler	mov    0x2(%rdx),%r9d
4336320Sbholler	add    $0x6,%rdx
4346320Sbholler	mov    %r9d,0x2(%rcx)
4356320Sbholler	add    $0x6,%rcx
4366320Sbholler	jmp    L(now_qw_aligned)
4376320Sbholler
4386320Sbholler	.balign 16
4396320SbhollerL(A3Q1):			# ; need to move 5=1+4 bytes
4406320Sbholler	movzbq (%rdx),%r11
4416320Sbholler	sub    $0x5,%r8
4426320Sbholler	mov    %r11b,(%rcx)
4436320Sbholler	mov    0x1(%rdx),%r9d
4446320Sbholler	add    $0x5,%rdx
4456320Sbholler	mov    %r9d,0x1(%rcx)
4466320Sbholler	add    $0x5,%rcx
4476320Sbholler	jmp    L(now_qw_aligned)
4486320Sbholler
4496320Sbholler	.balign 16
4506320SbhollerL(A4Q1):			# ; need to move 4 bytes
4516320Sbholler	mov    (%rdx),%r9d
4526320Sbholler	sub    $0x4,%r8
4536320Sbholler	add    $0x4,%rdx
4546320Sbholler	mov    %r9d,(%rcx)
4556320Sbholler	add    $0x4,%rcx
4566320Sbholler	jmp    L(now_qw_aligned)
4576320Sbholler
4586320Sbholler	.balign 16
4596320SbhollerL(A5Q1):			# ; need to move 3=1+2 bytes
4606320Sbholler	movzbq (%rdx),%r11
4616320Sbholler	sub    $0x3,%r8
4626320Sbholler	mov    %r11b,(%rcx)
4636320Sbholler
4646320Sbholler	movzwq 0x1(%rdx),%r10
4656320Sbholler	add    $0x3,%rdx
4666320Sbholler	mov    %r10w,0x1(%rcx)
4676320Sbholler
4686320Sbholler	add    $0x3,%rcx
4696320Sbholler	jmp    L(now_qw_aligned)
4706320Sbholler
4716320Sbholler	.balign 16
4726320SbhollerL(A6Q1):			# ; need to move 2 bytes
4736320Sbholler	movzwq (%rdx),%r10
4746320Sbholler	sub    $0x2,%r8
4756320Sbholler	add    $0x2,%rdx
4766320Sbholler	mov    %r10w,(%rcx)
4776320Sbholler	add    $0x2,%rcx
4786320Sbholler	jmp    L(now_qw_aligned)
4796320Sbholler
4806320Sbholler	.balign 16
4816320SbhollerL(A7Q1):			# ; need to move 1 byte
4826320Sbholler	movzbq (%rdx),%r11
4836320Sbholler	dec    %r8
4846320Sbholler	inc    %rdx
4856320Sbholler	mov    %r11b,(%rcx)
4866320Sbholler	inc    %rcx
4876320Sbholler	jmp    L(now_qw_aligned)
4886320Sbholler
4896320Sbholler
4906320Sbholler	.balign 16
4916320SbhollerL(P0QG):
4926320Sbholler	mov    -0x80(%rdx),%r9
4936320Sbholler	mov    %r9,-0x80(%rcx)
4946320SbhollerL(P0QF):
4956320Sbholler	mov    -0x78(%rdx),%r10
4966320Sbholler	mov    %r10,-0x78(%rcx)
4976320SbhollerL(P0QE):
4986320Sbholler	mov    -0x70(%rdx),%r9
4996320Sbholler	mov    %r9,-0x70(%rcx)
5006320SbhollerL(P0QD):
5016320Sbholler	mov    -0x68(%rdx),%r10
5026320Sbholler	mov    %r10,-0x68(%rcx)
5036320SbhollerL(P0QC):
5046320Sbholler	mov    -0x60(%rdx),%r9
5056320Sbholler	mov    %r9,-0x60(%rcx)
5066320SbhollerL(P0QB):
5076320Sbholler	mov    -0x58(%rdx),%r10
5086320Sbholler	mov    %r10,-0x58(%rcx)
5096320SbhollerL(P0QA):
5106320Sbholler	mov    -0x50(%rdx),%r9
5116320Sbholler	mov    %r9,-0x50(%rcx)
5126320SbhollerL(P0Q9):
5136320Sbholler	mov    -0x48(%rdx),%r10
5146320Sbholler	mov    %r10,-0x48(%rcx)
5156320SbhollerL(P0Q8):
5166320Sbholler	mov    -0x40(%rdx),%r9
5176320Sbholler	mov    %r9,-0x40(%rcx)
5186320SbhollerL(P0Q7):
5196320Sbholler	mov    -0x38(%rdx),%r10
5206320Sbholler	mov    %r10,-0x38(%rcx)
5216320SbhollerL(P0Q6):
5226320Sbholler	mov    -0x30(%rdx),%r9
5236320Sbholler	mov    %r9,-0x30(%rcx)
5246320SbhollerL(P0Q5):
5256320Sbholler	mov    -0x28(%rdx),%r10
5266320Sbholler	mov    %r10,-0x28(%rcx)
5276320SbhollerL(P0Q4):
5286320Sbholler	mov    -0x20(%rdx),%r9
5296320Sbholler	mov    %r9,-0x20(%rcx)
5306320SbhollerL(P0Q3):
5316320Sbholler	mov    -0x18(%rdx),%r10
5326320Sbholler	mov    %r10,-0x18(%rcx)
5336320SbhollerL(P0Q2):
5346320Sbholler	mov    -0x10(%rdx),%r9
5356320Sbholler	mov    %r9,-0x10(%rcx)
5366320SbhollerL(P0Q1):
5376320Sbholler	mov    -0x8(%rdx),%r10
5386320Sbholler	mov    %r10,-0x8(%rcx)
5396320SbhollerL(P0Q0):
5406320Sbholler	ret
5416320Sbholler
5426320Sbholler	.balign 16
5436320SbhollerL(P1QF):
5446320Sbholler	mov    -0x79(%rdx),%r9
5456320Sbholler	mov    %r9,-0x79(%rcx)
5466320SbhollerL(P1QE):
5476320Sbholler	mov    -0x71(%rdx),%r11
5486320Sbholler	mov    %r11,-0x71(%rcx)
5496320SbhollerL(P1QD):
5506320Sbholler	mov    -0x69(%rdx),%r10
5516320Sbholler	mov    %r10,-0x69(%rcx)
5526320SbhollerL(P1QC):
5536320Sbholler	mov    -0x61(%rdx),%r9
5546320Sbholler	mov    %r9,-0x61(%rcx)
5556320SbhollerL(P1QB):
5566320Sbholler	mov    -0x59(%rdx),%r11
5576320Sbholler	mov    %r11,-0x59(%rcx)
5586320SbhollerL(P1QA):
5596320Sbholler	mov    -0x51(%rdx),%r10
5606320Sbholler	mov    %r10,-0x51(%rcx)
5616320SbhollerL(P1Q9):
5626320Sbholler	mov    -0x49(%rdx),%r9
5636320Sbholler	mov    %r9,-0x49(%rcx)
5646320SbhollerL(P1Q8):
5656320Sbholler	mov    -0x41(%rdx),%r11
5666320Sbholler	mov    %r11,-0x41(%rcx)
5676320SbhollerL(P1Q7):
5686320Sbholler	mov    -0x39(%rdx),%r10
5696320Sbholler	mov    %r10,-0x39(%rcx)
5706320SbhollerL(P1Q6):
5716320Sbholler	mov    -0x31(%rdx),%r9
5726320Sbholler	mov    %r9,-0x31(%rcx)
5736320SbhollerL(P1Q5):
5746320Sbholler	mov    -0x29(%rdx),%r11
5756320Sbholler	mov    %r11,-0x29(%rcx)
5766320SbhollerL(P1Q4):
5776320Sbholler	mov    -0x21(%rdx),%r10
5786320Sbholler	mov    %r10,-0x21(%rcx)
5796320SbhollerL(P1Q3):
5806320Sbholler	mov    -0x19(%rdx),%r9
5816320Sbholler	mov    %r9,-0x19(%rcx)
5826320SbhollerL(P1Q2):
5836320Sbholler	mov    -0x11(%rdx),%r11
5846320Sbholler	mov    %r11,-0x11(%rcx)
5856320SbhollerL(P1Q1):
5866320Sbholler	mov    -0x9(%rdx),%r10
5876320Sbholler	mov    %r10,-0x9(%rcx)
5886320SbhollerL(P1Q0):
5896320Sbholler	movzbq -0x1(%rdx),%r9
5906320Sbholler	mov    %r9b,-0x1(%rcx)
5916320Sbholler	ret
5926320Sbholler
5936320Sbholler	.balign 16
5946320SbhollerL(P2QF):
5956320Sbholler	mov    -0x7a(%rdx),%r9
5966320Sbholler	mov    %r9,-0x7a(%rcx)
5976320SbhollerL(P2QE):
5986320Sbholler	mov    -0x72(%rdx),%r11
5996320Sbholler	mov    %r11,-0x72(%rcx)
6006320SbhollerL(P2QD):
6016320Sbholler	mov    -0x6a(%rdx),%r10
6026320Sbholler	mov    %r10,-0x6a(%rcx)
6036320SbhollerL(P2QC):
6046320Sbholler	mov    -0x62(%rdx),%r9
6056320Sbholler	mov    %r9,-0x62(%rcx)
6066320SbhollerL(P2QB):
6076320Sbholler	mov    -0x5a(%rdx),%r11
6086320Sbholler	mov    %r11,-0x5a(%rcx)
6096320SbhollerL(P2QA):
6106320Sbholler	mov    -0x52(%rdx),%r10
6116320Sbholler	mov    %r10,-0x52(%rcx)
6126320SbhollerL(P2Q9):
6136320Sbholler	mov    -0x4a(%rdx),%r9
6146320Sbholler	mov    %r9,-0x4a(%rcx)
6156320SbhollerL(P2Q8):
6166320Sbholler	mov    -0x42(%rdx),%r11
6176320Sbholler	mov    %r11,-0x42(%rcx)
6186320SbhollerL(P2Q7):
6196320Sbholler	mov    -0x3a(%rdx),%r10
6206320Sbholler	mov    %r10,-0x3a(%rcx)
6216320SbhollerL(P2Q6):
6226320Sbholler	mov    -0x32(%rdx),%r9
6236320Sbholler	mov    %r9,-0x32(%rcx)
6246320SbhollerL(P2Q5):
6256320Sbholler	mov    -0x2a(%rdx),%r11
6266320Sbholler	mov    %r11,-0x2a(%rcx)
6276320SbhollerL(P2Q4):
6286320Sbholler	mov    -0x22(%rdx),%r10
6296320Sbholler	mov    %r10,-0x22(%rcx)
6306320SbhollerL(P2Q3):
6316320Sbholler	mov    -0x1a(%rdx),%r9
6326320Sbholler	mov    %r9,-0x1a(%rcx)
6336320SbhollerL(P2Q2):
6346320Sbholler	mov    -0x12(%rdx),%r11
6356320Sbholler	mov    %r11,-0x12(%rcx)
6366320SbhollerL(P2Q1):
6376320Sbholler	mov    -0xa(%rdx),%r10
6386320Sbholler	mov    %r10,-0xa(%rcx)
6396320SbhollerL(P2Q0):
6406320Sbholler	movzwq -0x2(%rdx),%r9
6416320Sbholler	mov    %r9w,-0x2(%rcx)
6426320Sbholler	ret
6436320Sbholler
6446320Sbholler	.balign 16
6456320SbhollerL(P3QF):
6466320Sbholler	mov    -0x7b(%rdx),%r9
6476320Sbholler	mov    %r9,-0x7b(%rcx)
6486320SbhollerL(P3QE):
6496320Sbholler	mov    -0x73(%rdx),%r11
6506320Sbholler	mov    %r11,-0x73(%rcx)
6516320SbhollerL(P3QD):
6526320Sbholler	mov    -0x6b(%rdx),%r10
6536320Sbholler	mov    %r10,-0x6b(%rcx)
6546320SbhollerL(P3QC):
6556320Sbholler	mov    -0x63(%rdx),%r9
6566320Sbholler	mov    %r9,-0x63(%rcx)
6576320SbhollerL(P3QB):
6586320Sbholler	mov    -0x5b(%rdx),%r11
6596320Sbholler	mov    %r11,-0x5b(%rcx)
6606320SbhollerL(P3QA):
6616320Sbholler	mov    -0x53(%rdx),%r10
6626320Sbholler	mov    %r10,-0x53(%rcx)
6636320SbhollerL(P3Q9):
6646320Sbholler	mov    -0x4b(%rdx),%r9
6656320Sbholler	mov    %r9,-0x4b(%rcx)
6666320SbhollerL(P3Q8):
6676320Sbholler	mov    -0x43(%rdx),%r11
6686320Sbholler	mov    %r11,-0x43(%rcx)
6696320SbhollerL(P3Q7):
6706320Sbholler	mov    -0x3b(%rdx),%r10
6716320Sbholler	mov    %r10,-0x3b(%rcx)
6726320SbhollerL(P3Q6):
6736320Sbholler	mov    -0x33(%rdx),%r9
6746320Sbholler	mov    %r9,-0x33(%rcx)
6756320SbhollerL(P3Q5):
6766320Sbholler	mov    -0x2b(%rdx),%r11
6776320Sbholler	mov    %r11,-0x2b(%rcx)
6786320SbhollerL(P3Q4):
6796320Sbholler	mov    -0x23(%rdx),%r10
6806320Sbholler	mov    %r10,-0x23(%rcx)
6816320SbhollerL(P3Q3):
6826320Sbholler	mov    -0x1b(%rdx),%r9
6836320Sbholler	mov    %r9,-0x1b(%rcx)
6846320SbhollerL(P3Q2):
6856320Sbholler	mov    -0x13(%rdx),%r11
6866320Sbholler	mov    %r11,-0x13(%rcx)
6876320SbhollerL(P3Q1):
6886320Sbholler	mov    -0xb(%rdx),%r10
6896320Sbholler	mov    %r10,-0xb(%rcx)
6906320Sbholler	/*
6916320Sbholler	 * These trailing loads/stores have to do all their loads 1st,
6926320Sbholler	 * then do the stores.
6936320Sbholler	 */
6946320SbhollerL(P3Q0):
6956320Sbholler	movzwq -0x3(%rdx),%r9
6966320Sbholler	movzbq -0x1(%rdx),%r10
6976320Sbholler	mov    %r9w,-0x3(%rcx)
6986320Sbholler	mov    %r10b,-0x1(%rcx)
6996320Sbholler	ret
7006320Sbholler
7016320Sbholler	.balign 16
7026320SbhollerL(P4QF):
7036320Sbholler	mov    -0x7c(%rdx),%r9
7046320Sbholler	mov    %r9,-0x7c(%rcx)
7056320SbhollerL(P4QE):
7066320Sbholler	mov    -0x74(%rdx),%r11
7076320Sbholler	mov    %r11,-0x74(%rcx)
7086320SbhollerL(P4QD):
7096320Sbholler	mov    -0x6c(%rdx),%r10
7106320Sbholler	mov    %r10,-0x6c(%rcx)
7116320SbhollerL(P4QC):
7126320Sbholler	mov    -0x64(%rdx),%r9
7136320Sbholler	mov    %r9,-0x64(%rcx)
7146320SbhollerL(P4QB):
7156320Sbholler	mov    -0x5c(%rdx),%r11
7166320Sbholler	mov    %r11,-0x5c(%rcx)
7176320SbhollerL(P4QA):
7186320Sbholler	mov    -0x54(%rdx),%r10
7196320Sbholler	mov    %r10,-0x54(%rcx)
7206320SbhollerL(P4Q9):
7216320Sbholler	mov    -0x4c(%rdx),%r9
7226320Sbholler	mov    %r9,-0x4c(%rcx)
7236320SbhollerL(P4Q8):
7246320Sbholler	mov    -0x44(%rdx),%r11
7256320Sbholler	mov    %r11,-0x44(%rcx)
7266320SbhollerL(P4Q7):
7276320Sbholler	mov    -0x3c(%rdx),%r10
7286320Sbholler	mov    %r10,-0x3c(%rcx)
7296320SbhollerL(P4Q6):
7306320Sbholler	mov    -0x34(%rdx),%r9
7316320Sbholler	mov    %r9,-0x34(%rcx)
7326320SbhollerL(P4Q5):
7336320Sbholler	mov    -0x2c(%rdx),%r11
7346320Sbholler	mov    %r11,-0x2c(%rcx)
7356320SbhollerL(P4Q4):
7366320Sbholler	mov    -0x24(%rdx),%r10
7376320Sbholler	mov    %r10,-0x24(%rcx)
7386320SbhollerL(P4Q3):
7396320Sbholler	mov    -0x1c(%rdx),%r9
7406320Sbholler	mov    %r9,-0x1c(%rcx)
7416320SbhollerL(P4Q2):
7426320Sbholler	mov    -0x14(%rdx),%r11
7436320Sbholler	mov    %r11,-0x14(%rcx)
7446320SbhollerL(P4Q1):
7456320Sbholler	mov    -0xc(%rdx),%r10
7466320Sbholler	mov    %r10,-0xc(%rcx)
7476320SbhollerL(P4Q0):
7486320Sbholler	mov    -0x4(%rdx),%r9d
7496320Sbholler	mov    %r9d,-0x4(%rcx)
7506320Sbholler	ret
7516320Sbholler
7526320Sbholler	.balign 16
7536320SbhollerL(P5QF):
7546320Sbholler	mov    -0x7d(%rdx),%r9
7556320Sbholler	mov    %r9,-0x7d(%rcx)
7566320SbhollerL(P5QE):
7576320Sbholler	mov    -0x75(%rdx),%r11
7586320Sbholler	mov    %r11,-0x75(%rcx)
7596320SbhollerL(P5QD):
7606320Sbholler	mov    -0x6d(%rdx),%r10
7616320Sbholler	mov    %r10,-0x6d(%rcx)
7626320SbhollerL(P5QC):
7636320Sbholler	mov    -0x65(%rdx),%r9
7646320Sbholler	mov    %r9,-0x65(%rcx)
7656320SbhollerL(P5QB):
7666320Sbholler	mov    -0x5d(%rdx),%r11
7676320Sbholler	mov    %r11,-0x5d(%rcx)
7686320SbhollerL(P5QA):
7696320Sbholler	mov    -0x55(%rdx),%r10
7706320Sbholler	mov    %r10,-0x55(%rcx)
7716320SbhollerL(P5Q9):
7726320Sbholler	mov    -0x4d(%rdx),%r9
7736320Sbholler	mov    %r9,-0x4d(%rcx)
7746320SbhollerL(P5Q8):
7756320Sbholler	mov    -0x45(%rdx),%r11
7766320Sbholler	mov    %r11,-0x45(%rcx)
7776320SbhollerL(P5Q7):
7786320Sbholler	mov    -0x3d(%rdx),%r10
7796320Sbholler	mov    %r10,-0x3d(%rcx)
7806320SbhollerL(P5Q6):
7816320Sbholler	mov    -0x35(%rdx),%r9
7826320Sbholler	mov    %r9,-0x35(%rcx)
7836320SbhollerL(P5Q5):
7846320Sbholler	mov    -0x2d(%rdx),%r11
7856320Sbholler	mov    %r11,-0x2d(%rcx)
7866320SbhollerL(P5Q4):
7876320Sbholler	mov    -0x25(%rdx),%r10
7886320Sbholler	mov    %r10,-0x25(%rcx)
7896320SbhollerL(P5Q3):
7906320Sbholler	mov    -0x1d(%rdx),%r9
7916320Sbholler	mov    %r9,-0x1d(%rcx)
7926320SbhollerL(P5Q2):
7936320Sbholler	mov    -0x15(%rdx),%r11
7946320Sbholler	mov    %r11,-0x15(%rcx)
7956320SbhollerL(P5Q1):
7966320Sbholler	mov    -0xd(%rdx),%r10
7976320Sbholler	mov    %r10,-0xd(%rcx)
7986320Sbholler	/*
7996320Sbholler	 * These trailing loads/stores have to do all their loads 1st,
8006320Sbholler	 * then do the stores.
8016320Sbholler	 */
8026320SbhollerL(P5Q0):
8036320Sbholler	mov    -0x5(%rdx),%r9d
8046320Sbholler	movzbq -0x1(%rdx),%r10
8056320Sbholler	mov    %r9d,-0x5(%rcx)
8066320Sbholler	mov    %r10b,-0x1(%rcx)
8076320Sbholler	ret
8086320Sbholler
	/*
	 * Tail-copy entry points for residual counts congruent to 6 (mod 8).
	 * %rdx (src) and %rcx (dst) point past the end of the region;
	 * entry L(P6Qn) copies the final 8*n + 6 bytes, each entry copying
	 * one qword and falling through to the next smaller entry.
	 */
	.balign 16
L(P6QF):
	mov    -0x7e(%rdx),%r9
	mov    %r9,-0x7e(%rcx)
L(P6QE):
	mov    -0x76(%rdx),%r11
	mov    %r11,-0x76(%rcx)
L(P6QD):
	mov    -0x6e(%rdx),%r10
	mov    %r10,-0x6e(%rcx)
L(P6QC):
	mov    -0x66(%rdx),%r9
	mov    %r9,-0x66(%rcx)
L(P6QB):
	mov    -0x5e(%rdx),%r11
	mov    %r11,-0x5e(%rcx)
L(P6QA):
	mov    -0x56(%rdx),%r10
	mov    %r10,-0x56(%rcx)
L(P6Q9):
	mov    -0x4e(%rdx),%r9
	mov    %r9,-0x4e(%rcx)
L(P6Q8):
	mov    -0x46(%rdx),%r11
	mov    %r11,-0x46(%rcx)
L(P6Q7):
	mov    -0x3e(%rdx),%r10
	mov    %r10,-0x3e(%rcx)
L(P6Q6):
	mov    -0x36(%rdx),%r9
	mov    %r9,-0x36(%rcx)
L(P6Q5):
	mov    -0x2e(%rdx),%r11
	mov    %r11,-0x2e(%rcx)
L(P6Q4):
	mov    -0x26(%rdx),%r10
	mov    %r10,-0x26(%rcx)
L(P6Q3):
	mov    -0x1e(%rdx),%r9
	mov    %r9,-0x1e(%rcx)
L(P6Q2):
	mov    -0x16(%rdx),%r11
	mov    %r11,-0x16(%rcx)
L(P6Q1):
	mov    -0xe(%rdx),%r10
	mov    %r10,-0xe(%rcx)
	/*
	 * These trailing loads/stores have to do all their loads 1st,
	 * then do the stores.
	 */
L(P6Q0):
	mov    -0x6(%rdx),%r9d		# last 6 bytes = dword + word
	movzwq -0x2(%rdx),%r10
	mov    %r9d,-0x6(%rcx)
	mov    %r10w,-0x2(%rcx)
	ret
8656320Sbholler
	/*
	 * Tail-copy entry points for residual counts congruent to 7 (mod 8).
	 * %rdx (src) and %rcx (dst) point past the end of the region;
	 * entry L(P7Qn) copies the final 8*n + 7 bytes, each entry copying
	 * one qword and falling through to the next smaller entry.
	 */
	.balign 16
L(P7QF):
	mov    -0x7f(%rdx),%r9
	mov    %r9,-0x7f(%rcx)
L(P7QE):
	mov    -0x77(%rdx),%r11
	mov    %r11,-0x77(%rcx)
L(P7QD):
	mov    -0x6f(%rdx),%r10
	mov    %r10,-0x6f(%rcx)
L(P7QC):
	mov    -0x67(%rdx),%r9
	mov    %r9,-0x67(%rcx)
L(P7QB):
	mov    -0x5f(%rdx),%r11
	mov    %r11,-0x5f(%rcx)
L(P7QA):
	mov    -0x57(%rdx),%r10
	mov    %r10,-0x57(%rcx)
L(P7Q9):
	mov    -0x4f(%rdx),%r9
	mov    %r9,-0x4f(%rcx)
L(P7Q8):
	mov    -0x47(%rdx),%r11
	mov    %r11,-0x47(%rcx)
L(P7Q7):
	mov    -0x3f(%rdx),%r10
	mov    %r10,-0x3f(%rcx)
L(P7Q6):
	mov    -0x37(%rdx),%r9
	mov    %r9,-0x37(%rcx)
L(P7Q5):
	mov    -0x2f(%rdx),%r11
	mov    %r11,-0x2f(%rcx)
L(P7Q4):
	mov    -0x27(%rdx),%r10
	mov    %r10,-0x27(%rcx)
L(P7Q3):
	mov    -0x1f(%rdx),%r9
	mov    %r9,-0x1f(%rcx)
L(P7Q2):
	mov    -0x17(%rdx),%r11
	mov    %r11,-0x17(%rcx)
L(P7Q1):
	mov    -0xf(%rdx),%r10
	mov    %r10,-0xf(%rcx)
	/*
	 * These trailing loads/stores have to do all their loads 1st,
	 * then do the stores.
	 */
L(P7Q0):
	mov    -0x7(%rdx),%r9d		# last 7 bytes = dword + word + byte
	movzwq -0x3(%rdx),%r10
	movzbq -0x1(%rdx),%r11
	mov    %r9d,-0x7(%rcx)
	mov    %r10w,-0x3(%rcx)
	mov    %r11b,-0x1(%rcx)
	ret
9246320Sbholler
	/*
	 * Large-copy method selection.  Register roles (visible from the
	 * loads/stores below): %rdx = src, %rcx = dst, %r8 = byte count.
	 */
	.balign 16
L(ck_use_sse2):
	/*
	 * Align dest to 16 byte boundary.
	 */
	test   $0xf,%rcx
	jnz    L(ShrtAlignNew)

L(now_qw_aligned):
	/* dst is now 16-byte aligned; pick 8-byte loop if no SSE support */
	cmpl   $NO_SSE,.memops_method(%rip)
	je     L(Loop8byte_pre)

	/*
	 * The fall-through path is to do SSE2 16-byte load/stores
	 */

	/*
	 * If current move size is larger than half of the highest level cache
	 * size, then do non-temporal moves.
	 */
	mov    .largest_level_cache_size(%rip),%r9d
	shr    %r9		# take half of it
	cmp    %r9,%r8
	jg     L(sse2_nt_move)

	/*
	 * If both the source and dest are aligned, then use the both aligned
	 * logic. Well aligned data should reap the rewards.
	 */
	test   $0xf,%rdx
	jz     L(pre_both_aligned)

	/* select shift-merge dispatch table: SSE2 default, SSSE3 if present */
	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
	testl  $USE_SSSE3,.memops_method(%rip)
	jz     1f
	lea    L(SSSE3_src)(%rip),%r10		# SSSE3

1:
	/*
	 * if the src is not 16 byte aligned...
	 */
	mov    %rdx,%r11		# r11 = src & 0xf, the misalignment
	and    $0xf,%r11
	movdqu (%rdx),%xmm0		# copy first 16 bytes unaligned
	movdqa %xmm0,(%rcx)
	add    $0x10,%rdx		# round src DOWN to 16-byte boundary:
	sub    %r11,%rdx		#   rdx += 0x10 - misalignment
	add    $0x10,%rcx
	sub    $0x10,%r8
	movdqa (%rdx),%xmm1		# prime xmm1 with first aligned chunk

	/* indexed dispatch: 32-bit self-relative entry at table[r11] */
	movslq (%r10,%r11,4),%r9
	lea    (%r9,%r10,1),%r10
	jmpq   *%r10
9796320Sbholler
	/*
	 * Dispatch tables for the misaligned-source copy loops.  Each table
	 * has 16 entries of 32-bit offsets relative to the table's own label,
	 * indexed by (src & 0xf).  Index 0 (already aligned) goes to
	 * L(pre_both_aligned).  Index 8 of the SSSE3 table reuses the
	 * shufpd-based L(movdqa8) path (there is no mov3dqa8 variant).
	 */
	    .balign 16
L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
	    .int        L(mov3dqa1) -L(SSSE3_src)
	    .int        L(mov3dqa2) -L(SSSE3_src)
	    .int        L(mov3dqa3) -L(SSSE3_src)
	    .int        L(mov3dqa4) -L(SSSE3_src)
	    .int        L(mov3dqa5) -L(SSSE3_src)
	    .int        L(mov3dqa6) -L(SSSE3_src)
	    .int        L(mov3dqa7) -L(SSSE3_src)
	    .int        L(movdqa8)  -L(SSSE3_src)
	    .int        L(mov3dqa9) -L(SSSE3_src)
	    .int        L(mov3dqa10)-L(SSSE3_src)
	    .int        L(mov3dqa11)-L(SSSE3_src)
	    .int        L(mov3dqa12)-L(SSSE3_src)
	    .int        L(mov3dqa13)-L(SSSE3_src)
	    .int        L(mov3dqa14)-L(SSSE3_src)
	    .int        L(mov3dqa15)-L(SSSE3_src)
L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)
	    .int        L(movdqa1) -L(SSE_src)
	    .int        L(movdqa2) -L(SSE_src)
	    .int        L(movdqa3) -L(SSE_src)
	    .int        L(movdqa4) -L(SSE_src)
	    .int        L(movdqa5) -L(SSE_src)
	    .int        L(movdqa6) -L(SSE_src)
	    .int        L(movdqa7) -L(SSE_src)
	    .int        L(movdqa8) -L(SSE_src)
	    .int        L(movdqa9) -L(SSE_src)
	    .int        L(movdqa10)-L(SSE_src)
	    .int        L(movdqa11)-L(SSE_src)
	    .int        L(movdqa12)-L(SSE_src)
	    .int        L(movdqa13)-L(SSE_src)
	    .int        L(movdqa14)-L(SSE_src)
	    .int        L(movdqa15)-L(SSE_src)
10136320Sbholler
	/*
	 * SSE2 copy loop for source misalignment of 1 byte.  dst (%rcx) is
	 * 16-byte aligned and src (%rdx) has been rounded down to alignment;
	 * %xmm1 carries the previously loaded aligned chunk.  Each iteration
	 * merges two aligned loads into two aligned stores by shifting the
	 * previous chunk right 1 byte and the current chunk left 15 bytes,
	 * moving 0x20 bytes per pass while %r8 >= 0x20.
	 */
	.balign 16
L(movdqa1):
	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
	lea    0x20(%rdx),%rdx
	lea    -0x20(%r8),%r8

	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
	por    %xmm1,%xmm3 # OR them together
	cmp    $0x20,%r8   # NOTE: flags consumed by the jge far below

	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
	por    %xmm2,%xmm0 # OR them together
	movdqa %xmm3,(%rcx)     # store it
	movdqa %xmm0,0x10(%rcx) # store it
	lea    0x20(%rcx),%rcx

	jge    L(movdqa1)
	jmp    L(movdqa_epi)
10376320Sbholler
	/*
	 * SSE2 copy loops for source misalignments 2 through 7.  All six are
	 * the same shape as L(movdqa1): %xmm1 holds the previous aligned
	 * chunk; each pass loads two aligned 16-byte chunks, recombines
	 * neighbours with psrldq $N / pslldq $(16-N) / por, stores two
	 * aligned chunks to dst, and loops while %r8 >= 0x20.  Exits to
	 * L(movdqa_epi) for the sub-0x20 remainder.
	 */
	.balign 16
L(movdqa2):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x2,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xe,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x2,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xe,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa2)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa3):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x3,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xd,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x3,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xd,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa3)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa4):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x4,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xc,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x4,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xc,%xmm0
	por    %xmm2,%xmm0

	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa4)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa5):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x5,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xb,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x5,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xb,%xmm0
	por    %xmm2,%xmm0

	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa5)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa6):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x6,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xa,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x6,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xa,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa6)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa7):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x7,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x9,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x7,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x9,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa7)
	jmp    L(movdqa_epi)
11836320Sbholler
	/*
	 * Copy loop for source misalignment of exactly 8 bytes (used by both
	 * the SSE2 and SSSE3 dispatch tables).  Instead of byte shifts, each
	 * merge is a single shufpd $1: result = high qword of the previous
	 * chunk + low qword of the current chunk.  Processes 0x30 bytes per
	 * iteration while %r8 >= 0x30; the stray cmp placement keeps the
	 * flags live for the jge at the bottom.
	 */
	.balign 16
L(movdqa8):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx

	shufpd $0x1,%xmm3,%xmm1		# xmm1 = {prev.hi, cur.lo}
	movdqa %xmm1,(%rcx)

	cmp    $0x30,%r8		# flags consumed by jge below

	shufpd $0x1,%xmm0,%xmm3
	movdqa %xmm3,0x10(%rcx)

	movdqa %xmm5,%xmm1		# save last chunk for next iteration
	shufpd $0x1,%xmm5,%xmm0
	movdqa %xmm0,0x20(%rcx)

	lea    0x30(%rcx),%rcx

	jge    L(movdqa8)
	jmp    L(movdqa_epi)
12086320Sbholler
	/*
	 * SSE2 copy loops for source misalignments 9 through 15.  Same shape
	 * as L(movdqa2)..L(movdqa7): %xmm1 carries the previous aligned
	 * chunk; each pass merges neighbouring aligned loads with
	 * psrldq $N / pslldq $(16-N) / por and stores 0x20 aligned bytes,
	 * looping while %r8 >= 0x20.  L(movdqa15) falls through directly
	 * into L(movdqa_epi) (its jmp is commented out).
	 */
	.balign 16
L(movdqa9):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x9,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x7,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x9,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x7,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa9)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa10):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xa,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x6,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xa,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x6,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa10)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa11):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xb,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x5,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xb,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x5,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa11)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa12):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xc,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x4,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xc,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x4,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa12)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa13):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xd,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x3,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xd,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x3,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa13)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa14):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xe,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x2,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xe,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x2,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa14)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa15):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xf,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x1,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xf,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x1,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa15)
	#jmp   L(movdqa_epi)	# falls through to L(movdqa_epi)
13766320Sbholler
	/*
	 * Common epilogue for all the misaligned shift-merge loops above.
	 * %rdx lagged behind by the original misalignment (%r11) because the
	 * loops kept src rounded down to 16 bytes; restore it, advance both
	 * pointers past the end of the region, and dispatch the remaining
	 * %r8 bytes through the fwdPxQx tail table (defined earlier in this
	 * file; entries are 32-bit offsets relative to the table base).
	 */
	.balign 16
L(movdqa_epi):
	lea    L(fwdPxQx)(%rip),%r10
	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
	add    %r8,%rcx
	add    %r8,%rdx

	movslq (%r10,%r8,4),%r9
	lea    (%r9,%r10,1),%r10
	jmpq   *%r10
13876320Sbholler
	/*
	 * SSSE3 copy loop for source misalignment of 1 byte.  Same contract
	 * as L(movdqa1), but each merge is a single palignr instead of a
	 * shift/shift/or triple, and the main loop moves 0x30 bytes per
	 * pass (plus up to two 0x10-byte trailer steps before falling into
	 * L(movdqa_epi)).  The palignr instructions are hand-encoded as
	 * .byte sequences (the mnemonic is given in the comment above each);
	 * presumably the assembler of the day lacked SSSE3 support — the
	 * encodings decode to exactly the commented instructions.
	 */
	.balign 16
L(mov3dqa1):
	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
	sub	$0x30,%r8
	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
	lea	0x30(%rdx),%rdx
	cmp	$0x30,%r8

	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
	#palignr	$0x1,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x01
	movdqa	%xmm3,(%rcx)      # store it

	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
	#palignr	$0x1,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x01
	movdqa	%xmm0,0x10(%rcx)  # store it

	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
	#palignr	$0x1,%xmm4,%xmm5
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xec,0x01
	movdqa	%xmm5,0x20(%rcx)  # store it

	lea	0x30(%rcx),%rcx
	jge	L(mov3dqa1)

	cmp	$0x10,%r8
	jl	L(movdqa_epi)
	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	movdqa	%xmm3,%xmm2		# save for use next concat
	#palignr	$0x1,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x01

	cmp	$0x10,%r8
	movdqa	%xmm3,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jl	L(movdqa_epi)

	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	#palignr	$0x1,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x01
	movdqa	%xmm0,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jmp	L(movdqa_epi)
14426320Sbholler
	/*
	 * SSSE3 copy loops for source misalignments 2 through 7.  All six
	 * are the same shape as L(mov3dqa1): merge neighbouring aligned
	 * 16-byte chunks with palignr $N (hand-encoded as .byte sequences;
	 * the mnemonic appears in the comment above each), 0x30 bytes per
	 * main-loop pass plus up to two 0x10-byte trailer steps, then fall
	 * into L(movdqa_epi) for the remainder.
	 */
	.balign 16
L(mov3dqa2):
	movdqa	0x10(%rdx),%xmm3
	sub	$0x30,%r8
	movdqa	0x20(%rdx),%xmm0
	movdqa	0x30(%rdx),%xmm5
	lea	0x30(%rdx),%rdx
	cmp	$0x30,%r8

	movdqa	%xmm3,%xmm2
	#palignr	$0x2,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x02
	movdqa	%xmm3,(%rcx)

	movdqa	%xmm0,%xmm4
	#palignr	$0x2,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x02
	movdqa	%xmm0,0x10(%rcx)

	movdqa	%xmm5,%xmm1
	#palignr	$0x2,%xmm4,%xmm5
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xec,0x02
	movdqa	%xmm5,0x20(%rcx)

	lea	0x30(%rcx),%rcx
	jge	L(mov3dqa2)

	cmp	$0x10,%r8
	jl	L(movdqa_epi)
	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	movdqa	%xmm3,%xmm2		# save for use next concat
	#palignr	$0x2,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x02

	cmp	$0x10,%r8
	movdqa	%xmm3,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jl	L(movdqa_epi)

	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	#palignr	$0x2,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x02
	movdqa	%xmm0,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jmp	L(movdqa_epi)

	.balign 16
L(mov3dqa3):
	movdqa	0x10(%rdx),%xmm3
	sub	$0x30,%r8
	movdqa	0x20(%rdx),%xmm0
	movdqa	0x30(%rdx),%xmm5
	lea	0x30(%rdx),%rdx
	cmp	$0x30,%r8

	movdqa	%xmm3,%xmm2
	#palignr	$0x3,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x03
	movdqa	%xmm3,(%rcx)

	movdqa	%xmm0,%xmm4
	#palignr	$0x3,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x03
	movdqa	%xmm0,0x10(%rcx)

	movdqa	%xmm5,%xmm1
	#palignr	$0x3,%xmm4,%xmm5
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xec,0x03
	movdqa	%xmm5,0x20(%rcx)

	lea	0x30(%rcx),%rcx
	jge	L(mov3dqa3)

	cmp	$0x10,%r8
	jl	L(movdqa_epi)
	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	movdqa	%xmm3,%xmm2		# save for use next concat
	#palignr	$0x3,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x03

	cmp	$0x10,%r8
	movdqa	%xmm3,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jl	L(movdqa_epi)

	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	#palignr	$0x3,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x03
	movdqa	%xmm0,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jmp	L(movdqa_epi)

	.balign 16
L(mov3dqa4):
	movdqa	0x10(%rdx),%xmm3
	sub	$0x30,%r8
	movdqa	0x20(%rdx),%xmm0
	movdqa	0x30(%rdx),%xmm5
	lea	0x30(%rdx),%rdx
	cmp	$0x30,%r8

	movdqa	%xmm3,%xmm2
	#palignr	$0x4,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x04
	movdqa	%xmm3,(%rcx)

	movdqa	%xmm0,%xmm4
	#palignr	$0x4,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x04
	movdqa	%xmm0,0x10(%rcx)

	movdqa	%xmm5,%xmm1
	#palignr	$0x4,%xmm4,%xmm5
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xec,0x04
	movdqa	%xmm5,0x20(%rcx)

	lea	0x30(%rcx),%rcx
	jge	L(mov3dqa4)

	cmp	$0x10,%r8
	jl	L(movdqa_epi)
	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	movdqa	%xmm3,%xmm2		# save for use next concat
	#palignr	$0x4,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x04

	cmp	$0x10,%r8
	movdqa	%xmm3,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jl	L(movdqa_epi)

	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	#palignr	$0x4,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x04
	movdqa	%xmm0,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jmp	L(movdqa_epi)

	.balign 16
L(mov3dqa5):
	movdqa	0x10(%rdx),%xmm3
	sub	$0x30,%r8
	movdqa	0x20(%rdx),%xmm0
	movdqa	0x30(%rdx),%xmm5
	lea	0x30(%rdx),%rdx
	cmp	$0x30,%r8

	movdqa	%xmm3,%xmm2
	#palignr	$0x5,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x05
	movdqa	%xmm3,(%rcx)

	movdqa	%xmm0,%xmm4
	#palignr	$0x5,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x05
	movdqa	%xmm0,0x10(%rcx)

	movdqa	%xmm5,%xmm1
	#palignr	$0x5,%xmm4,%xmm5
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xec,0x05
	movdqa	%xmm5,0x20(%rcx)

	lea	0x30(%rcx),%rcx
	jge	L(mov3dqa5)

	cmp	$0x10,%r8
	jl	L(movdqa_epi)
	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	movdqa	%xmm3,%xmm2		# save for use next concat
	#palignr	$0x5,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x05

	cmp	$0x10,%r8
	movdqa	%xmm3,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jl	L(movdqa_epi)

	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	#palignr	$0x5,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x05
	movdqa	%xmm0,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jmp	L(movdqa_epi)

	.balign 16
L(mov3dqa6):
	movdqa	0x10(%rdx),%xmm3
	sub	$0x30,%r8
	movdqa	0x20(%rdx),%xmm0
	movdqa	0x30(%rdx),%xmm5
	lea	0x30(%rdx),%rdx
	cmp	$0x30,%r8

	movdqa	%xmm3,%xmm2
	#palignr	$0x6,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x06
	movdqa	%xmm3,(%rcx)

	movdqa	%xmm0,%xmm4
	#palignr	$0x6,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x06
	movdqa	%xmm0,0x10(%rcx)

	movdqa	%xmm5,%xmm1
	#palignr	$0x6,%xmm4,%xmm5
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xec,0x06
	movdqa	%xmm5,0x20(%rcx)

	lea	0x30(%rcx),%rcx
	jge	L(mov3dqa6)

	cmp	$0x10,%r8
	jl	L(movdqa_epi)
	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	movdqa	%xmm3,%xmm2		# save for use next concat
	#palignr	$0x6,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x06

	cmp	$0x10,%r8
	movdqa	%xmm3,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jl	L(movdqa_epi)

	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	#palignr	$0x6,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x06
	movdqa	%xmm0,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jmp	L(movdqa_epi)

	.balign 16
L(mov3dqa7):
	movdqa	0x10(%rdx),%xmm3
	sub	$0x30,%r8
	movdqa	0x20(%rdx),%xmm0
	movdqa	0x30(%rdx),%xmm5
	lea	0x30(%rdx),%rdx
	cmp	$0x30,%r8

	movdqa	%xmm3,%xmm2
	#palignr	$0x7,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x07
	movdqa	%xmm3,(%rcx)

	movdqa	%xmm0,%xmm4
	#palignr	$0x7,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x07
	movdqa	%xmm0,0x10(%rcx)

	movdqa	%xmm5,%xmm1
	#palignr	$0x7,%xmm4,%xmm5
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xec,0x07
	movdqa	%xmm5,0x20(%rcx)

	lea	0x30(%rcx),%rcx
	jge	L(mov3dqa7)

	cmp	$0x10,%r8
	jl	L(movdqa_epi)
	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	movdqa	%xmm3,%xmm2		# save for use next concat
	#palignr	$0x7,%xmm1,%xmm3
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xd9,0x07

	cmp	$0x10,%r8
	movdqa	%xmm3,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jl	L(movdqa_epi)

	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
	sub	$0x10,%r8
	lea	0x10(%rdx),%rdx
	#palignr	$0x7,%xmm2,%xmm0
	.byte	0x66,0x0f,0x3a,0x0f
	.byte	0xc2,0x07
	movdqa	%xmm0,(%rcx)      	# store it
	lea	0x10(%rcx),%rcx
	jmp	L(movdqa_epi)
17726320Sbholler
17736320Sbholler	.balign 16
17746320SbhollerL(mov3dqa9):
17756320Sbholler	movdqa	0x10(%rdx),%xmm3
17766320Sbholler	sub	$0x30,%r8
17776320Sbholler	movdqa	0x20(%rdx),%xmm0
17786320Sbholler	movdqa	0x30(%rdx),%xmm5
17796320Sbholler	lea	0x30(%rdx),%rdx
17806320Sbholler	cmp	$0x30,%r8
17816320Sbholler
17826320Sbholler	movdqa	%xmm3,%xmm2
17836320Sbholler	#palignr	$0x9,%xmm1,%xmm3
17846320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17856320Sbholler	.byte	0xd9,0x09
17866320Sbholler	movdqa	%xmm3,(%rcx)
17876320Sbholler
17886320Sbholler	movdqa	%xmm0,%xmm4
17896320Sbholler	#palignr	$0x9,%xmm2,%xmm0
17906320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17916320Sbholler	.byte	0xc2,0x09
17926320Sbholler	movdqa	%xmm0,0x10(%rcx)
17936320Sbholler
17946320Sbholler	movdqa	%xmm5,%xmm1
17956320Sbholler	#palignr	$0x9,%xmm4,%xmm5
17966320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17976320Sbholler	.byte	0xec,0x09
17986320Sbholler	movdqa	%xmm5,0x20(%rcx)
17996320Sbholler
18006320Sbholler	lea	0x30(%rcx),%rcx
18016320Sbholler	jge	L(mov3dqa9)
18026320Sbholler
18036320Sbholler	cmp	$0x10,%r8
18046320Sbholler	jl	L(movdqa_epi)
18056320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
18066320Sbholler	sub	$0x10,%r8
18076320Sbholler	lea	0x10(%rdx),%rdx
18086320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
18096320Sbholler	#palignr	$0x9,%xmm1,%xmm3
18106320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18116320Sbholler	.byte	0xd9,0x09
18126320Sbholler
18136320Sbholler	cmp	$0x10,%r8
18146320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
18156320Sbholler	lea	0x10(%rcx),%rcx
18166320Sbholler	jl	L(movdqa_epi)
18176320Sbholler
18186320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
18196320Sbholler	sub	$0x10,%r8
18206320Sbholler	lea	0x10(%rdx),%rdx
18216320Sbholler	#palignr	$0x9,%xmm2,%xmm0
18226320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18236320Sbholler	.byte	0xc2,0x09
18246320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
18256320Sbholler	lea	0x10(%rcx),%rcx
18266320Sbholler	jmp	L(movdqa_epi)
18276320Sbholler
18286320Sbholler	.balign 16
18296320SbhollerL(mov3dqa10):
18306320Sbholler	movdqa	0x10(%rdx),%xmm3
18316320Sbholler	sub	$0x30,%r8
18326320Sbholler	movdqa	0x20(%rdx),%xmm0
18336320Sbholler	movdqa	0x30(%rdx),%xmm5
18346320Sbholler	lea	0x30(%rdx),%rdx
18356320Sbholler	cmp	$0x30,%r8
18366320Sbholler
18376320Sbholler	movdqa	%xmm3,%xmm2
18386320Sbholler	#palignr	$0xa,%xmm1,%xmm3
18396320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18406320Sbholler	.byte	0xd9,0x0a
18416320Sbholler	movdqa	%xmm3,(%rcx)
18426320Sbholler
18436320Sbholler	movdqa	%xmm0,%xmm4
18446320Sbholler	#palignr	$0xa,%xmm2,%xmm0
18456320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18466320Sbholler	.byte	0xc2,0x0a
18476320Sbholler	movdqa	%xmm0,0x10(%rcx)
18486320Sbholler
18496320Sbholler	movdqa	%xmm5,%xmm1
18506320Sbholler	#palignr	$0xa,%xmm4,%xmm5
18516320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18526320Sbholler	.byte	0xec,0x0a
18536320Sbholler	movdqa	%xmm5,0x20(%rcx)
18546320Sbholler
18556320Sbholler	lea	0x30(%rcx),%rcx
18566320Sbholler	jge	L(mov3dqa10)
18576320Sbholler
18586320Sbholler	cmp	$0x10,%r8
18596320Sbholler	jl	L(movdqa_epi)
18606320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
18616320Sbholler	sub	$0x10,%r8
18626320Sbholler	lea	0x10(%rdx),%rdx
18636320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
18646320Sbholler	#palignr	$0xa,%xmm1,%xmm3
18656320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18666320Sbholler	.byte	0xd9,0x0a
18676320Sbholler
18686320Sbholler	cmp	$0x10,%r8
18696320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
18706320Sbholler	lea	0x10(%rcx),%rcx
18716320Sbholler	jl	L(movdqa_epi)
18726320Sbholler
18736320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
18746320Sbholler	sub	$0x10,%r8
18756320Sbholler	lea	0x10(%rdx),%rdx
18766320Sbholler	#palignr	$0xa,%xmm2,%xmm0
18776320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18786320Sbholler	.byte	0xc2,0x0a
18796320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
18806320Sbholler	lea	0x10(%rcx),%rcx
18816320Sbholler	jmp	L(movdqa_epi)
18826320Sbholler
18836320Sbholler	.balign 16
18846320SbhollerL(mov3dqa11):
18856320Sbholler	movdqa	0x10(%rdx),%xmm3
18866320Sbholler	sub	$0x30,%r8
18876320Sbholler	movdqa	0x20(%rdx),%xmm0
18886320Sbholler	movdqa	0x30(%rdx),%xmm5
18896320Sbholler	lea	0x30(%rdx),%rdx
18906320Sbholler	cmp	$0x30,%r8
18916320Sbholler
18926320Sbholler	movdqa	%xmm3,%xmm2
18936320Sbholler	#palignr	$0xb,%xmm1,%xmm3
18946320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18956320Sbholler	.byte	0xd9,0x0b
18966320Sbholler	movdqa	%xmm3,(%rcx)
18976320Sbholler
18986320Sbholler	movdqa	%xmm0,%xmm4
18996320Sbholler	#palignr	$0xb,%xmm2,%xmm0
19006320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19016320Sbholler	.byte	0xc2,0x0b
19026320Sbholler	movdqa	%xmm0,0x10(%rcx)
19036320Sbholler
19046320Sbholler	movdqa	%xmm5,%xmm1
19056320Sbholler	#palignr	$0xb,%xmm4,%xmm5
19066320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19076320Sbholler	.byte	0xec,0x0b
19086320Sbholler	movdqa	%xmm5,0x20(%rcx)
19096320Sbholler
19106320Sbholler	lea	0x30(%rcx),%rcx
19116320Sbholler	jge	L(mov3dqa11)
19126320Sbholler
19136320Sbholler	cmp	$0x10,%r8
19146320Sbholler	jl	L(movdqa_epi)
19156320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
19166320Sbholler	sub	$0x10,%r8
19176320Sbholler	lea	0x10(%rdx),%rdx
19186320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
19196320Sbholler	#palignr	$0xb,%xmm1,%xmm3
19206320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19216320Sbholler	.byte	0xd9,0x0b
19226320Sbholler
19236320Sbholler	cmp	$0x10,%r8
19246320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
19256320Sbholler	lea	0x10(%rcx),%rcx
19266320Sbholler	jl	L(movdqa_epi)
19276320Sbholler
19286320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
19296320Sbholler	sub	$0x10,%r8
19306320Sbholler	lea	0x10(%rdx),%rdx
19316320Sbholler	#palignr	$0xb,%xmm2,%xmm0
19326320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19336320Sbholler	.byte	0xc2,0x0b
19346320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
19356320Sbholler	lea	0x10(%rcx),%rcx
19366320Sbholler	jmp	L(movdqa_epi)
19376320Sbholler
19386320Sbholler	.balign 16
19396320SbhollerL(mov3dqa12):
19406320Sbholler	movdqa	0x10(%rdx),%xmm3
19416320Sbholler	sub	$0x30,%r8
19426320Sbholler	movdqa	0x20(%rdx),%xmm0
19436320Sbholler	movdqa	0x30(%rdx),%xmm5
19446320Sbholler	lea	0x30(%rdx),%rdx
19456320Sbholler	cmp	$0x30,%r8
19466320Sbholler
19476320Sbholler	movdqa	%xmm3,%xmm2
19486320Sbholler	#palignr	$0xc,%xmm1,%xmm3
19496320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19506320Sbholler	.byte	0xd9,0x0c
19516320Sbholler	movdqa	%xmm3,(%rcx)
19526320Sbholler
19536320Sbholler	movdqa	%xmm0,%xmm4
19546320Sbholler	#palignr	$0xc,%xmm2,%xmm0
19556320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19566320Sbholler	.byte	0xc2,0x0c
19576320Sbholler	movdqa	%xmm0,0x10(%rcx)
19586320Sbholler
19596320Sbholler	movdqa	%xmm5,%xmm1
19606320Sbholler	#palignr	$0xc,%xmm4,%xmm5
19616320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19626320Sbholler	.byte	0xec,0x0c
19636320Sbholler	movdqa	%xmm5,0x20(%rcx)
19646320Sbholler
19656320Sbholler	lea	0x30(%rcx),%rcx
19666320Sbholler	jge	L(mov3dqa12)
19676320Sbholler
19686320Sbholler	cmp	$0x10,%r8
19696320Sbholler	jl	L(movdqa_epi)
19706320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
19716320Sbholler	sub	$0x10,%r8
19726320Sbholler	lea	0x10(%rdx),%rdx
19736320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
19746320Sbholler	#palignr	$0xc,%xmm1,%xmm3
19756320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19766320Sbholler	.byte	0xd9,0x0c
19776320Sbholler
19786320Sbholler	cmp	$0x10,%r8
19796320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
19806320Sbholler	lea	0x10(%rcx),%rcx
19816320Sbholler	jl	L(movdqa_epi)
19826320Sbholler
19836320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
19846320Sbholler	sub	$0x10,%r8
19856320Sbholler	lea	0x10(%rdx),%rdx
19866320Sbholler	#palignr	$0xc,%xmm2,%xmm0
19876320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19886320Sbholler	.byte	0xc2,0x0c
19896320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
19906320Sbholler	lea	0x10(%rcx),%rcx
19916320Sbholler	jmp	L(movdqa_epi)
19926320Sbholler
19936320Sbholler	.balign 16
19946320SbhollerL(mov3dqa13):
19956320Sbholler	movdqa	0x10(%rdx),%xmm3
19966320Sbholler	sub	$0x30,%r8
19976320Sbholler	movdqa	0x20(%rdx),%xmm0
19986320Sbholler	movdqa	0x30(%rdx),%xmm5
19996320Sbholler	lea	0x30(%rdx),%rdx
20006320Sbholler	cmp	$0x30,%r8
20016320Sbholler
20026320Sbholler	movdqa	%xmm3,%xmm2
20036320Sbholler	#palignr	$0xd,%xmm1,%xmm3
20046320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20056320Sbholler	.byte	0xd9,0x0d
20066320Sbholler	movdqa	%xmm3,(%rcx)
20076320Sbholler
20086320Sbholler	movdqa	%xmm0,%xmm4
20096320Sbholler	#palignr	$0xd,%xmm2,%xmm0
20106320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20116320Sbholler	.byte	0xc2,0x0d
20126320Sbholler	movdqa	%xmm0,0x10(%rcx)
20136320Sbholler
20146320Sbholler	movdqa	%xmm5,%xmm1
20156320Sbholler	#palignr	$0xd,%xmm4,%xmm5
20166320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20176320Sbholler	.byte	0xec,0x0d
20186320Sbholler	movdqa	%xmm5,0x20(%rcx)
20196320Sbholler
20206320Sbholler	lea	0x30(%rcx),%rcx
20216320Sbholler	jge	L(mov3dqa13)
20226320Sbholler
20236320Sbholler	cmp	$0x10,%r8
20246320Sbholler	jl	L(movdqa_epi)
20256320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
20266320Sbholler	sub	$0x10,%r8
20276320Sbholler	lea	0x10(%rdx),%rdx
20286320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
20296320Sbholler	#palignr	$0xd,%xmm1,%xmm3
20306320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20316320Sbholler	.byte	0xd9,0x0d
20326320Sbholler
20336320Sbholler	cmp	$0x10,%r8
20346320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
20356320Sbholler	lea	0x10(%rcx),%rcx
20366320Sbholler	jl	L(movdqa_epi)
20376320Sbholler
20386320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
20396320Sbholler	sub	$0x10,%r8
20406320Sbholler	lea	0x10(%rdx),%rdx
20416320Sbholler	#palignr	$0xd,%xmm2,%xmm0
20426320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20436320Sbholler	.byte	0xc2,0x0d
20446320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
20456320Sbholler	lea	0x10(%rcx),%rcx
20466320Sbholler	jmp	L(movdqa_epi)
20476320Sbholler
20486320Sbholler	.balign 16
20496320SbhollerL(mov3dqa14):
20506320Sbholler	movdqa	0x10(%rdx),%xmm3
20516320Sbholler	sub	$0x30,%r8
20526320Sbholler	movdqa	0x20(%rdx),%xmm0
20536320Sbholler	movdqa	0x30(%rdx),%xmm5
20546320Sbholler	lea	0x30(%rdx),%rdx
20556320Sbholler	cmp	$0x30,%r8
20566320Sbholler
20576320Sbholler	movdqa	%xmm3,%xmm2
20586320Sbholler	#palignr	$0xe,%xmm1,%xmm3
20596320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20606320Sbholler	.byte	0xd9,0x0e
20616320Sbholler	movdqa	%xmm3,(%rcx)
20626320Sbholler
20636320Sbholler	movdqa	%xmm0,%xmm4
20646320Sbholler	#palignr	$0xe,%xmm2,%xmm0
20656320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20666320Sbholler	.byte	0xc2,0x0e
20676320Sbholler	movdqa	%xmm0,0x10(%rcx)
20686320Sbholler
20696320Sbholler	movdqa	%xmm5,%xmm1
20706320Sbholler	#palignr	$0xe,%xmm4,%xmm5
20716320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20726320Sbholler	.byte	0xec,0x0e
20736320Sbholler	movdqa	%xmm5,0x20(%rcx)
20746320Sbholler
20756320Sbholler	lea	0x30(%rcx),%rcx
20766320Sbholler	jge	L(mov3dqa14)
20776320Sbholler
20786320Sbholler	cmp	$0x10,%r8
20796320Sbholler	jl	L(movdqa_epi)
20806320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
20816320Sbholler	sub	$0x10,%r8
20826320Sbholler	lea	0x10(%rdx),%rdx
20836320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
20846320Sbholler	#palignr	$0xe,%xmm1,%xmm3
20856320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20866320Sbholler	.byte	0xd9,0x0e
20876320Sbholler
20886320Sbholler	cmp	$0x10,%r8
20896320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
20906320Sbholler	lea	0x10(%rcx),%rcx
20916320Sbholler	jl	L(movdqa_epi)
20926320Sbholler
20936320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
20946320Sbholler	sub	$0x10,%r8
20956320Sbholler	lea	0x10(%rdx),%rdx
20966320Sbholler	#palignr	$0xe,%xmm2,%xmm0
20976320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20986320Sbholler	.byte	0xc2,0x0e
20996320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
21006320Sbholler	lea	0x10(%rcx),%rcx
21016320Sbholler	jmp	L(movdqa_epi)
21026320Sbholler
21036320Sbholler	.balign 16
21046320SbhollerL(mov3dqa15):
21056320Sbholler	movdqa	0x10(%rdx),%xmm3
21066320Sbholler	sub	$0x30,%r8
21076320Sbholler	movdqa	0x20(%rdx),%xmm0
21086320Sbholler	movdqa	0x30(%rdx),%xmm5
21096320Sbholler	lea	0x30(%rdx),%rdx
21106320Sbholler	cmp	$0x30,%r8
21116320Sbholler
21126320Sbholler	movdqa	%xmm3,%xmm2
21136320Sbholler	#palignr	$0xf,%xmm1,%xmm3
21146320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21156320Sbholler	.byte	0xd9,0x0f
21166320Sbholler	movdqa	%xmm3,(%rcx)
21176320Sbholler
21186320Sbholler	movdqa	%xmm0,%xmm4
21196320Sbholler	#palignr	$0xf,%xmm2,%xmm0
21206320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21216320Sbholler	.byte	0xc2,0x0f
21226320Sbholler	movdqa	%xmm0,0x10(%rcx)
21236320Sbholler
21246320Sbholler	movdqa	%xmm5,%xmm1
21256320Sbholler	#palignr	$0xf,%xmm4,%xmm5
21266320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21276320Sbholler	.byte	0xec,0x0f
21286320Sbholler	movdqa	%xmm5,0x20(%rcx)
21296320Sbholler
21306320Sbholler	lea	0x30(%rcx),%rcx
21316320Sbholler	jge	L(mov3dqa15)
21326320Sbholler
21336320Sbholler	cmp	$0x10,%r8
21346320Sbholler	jl	L(movdqa_epi)
21356320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
21366320Sbholler	sub	$0x10,%r8
21376320Sbholler	lea	0x10(%rdx),%rdx
21386320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
21396320Sbholler	#palignr	$0xf,%xmm1,%xmm3
21406320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21416320Sbholler	.byte	0xd9,0x0f
21426320Sbholler
21436320Sbholler	cmp	$0x10,%r8
21446320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
21456320Sbholler	lea	0x10(%rcx),%rcx
21466320Sbholler	jl	L(movdqa_epi)
21476320Sbholler
21486320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
21496320Sbholler	sub	$0x10,%r8
21506320Sbholler	lea	0x10(%rdx),%rdx
21516320Sbholler	#palignr	$0xf,%xmm2,%xmm0
21526320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21536320Sbholler	.byte	0xc2,0x0f
21546320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
21556320Sbholler	lea	0x10(%rcx),%rcx
21566320Sbholler	jmp	L(movdqa_epi)
21576320Sbholler
21586320Sbholler	.balign 16
21596320SbhollerL(sse2_nt_move):
21606320Sbholler	lea	0x40(%rcx),%rcx
21616320Sbholler	lea	0x40(%rdx),%rdx
21626320Sbholler	lea	-0x40(%r8),%r8
21636320Sbholler
21646320Sbholler	/*
21656320Sbholler	 * doesn't matter if source is aligned for stuff out of cache.
21666320Sbholler	 * the mis-aligned penalty is masked by the slowness of main memory.
21676320Sbholler	 */
21686320Sbholler	prefetchnta 0x180(%rdx)
21696320Sbholler	movdqu	-0x40(%rdx),%xmm0
21706320Sbholler	movdqu	-0x30(%rdx),%xmm1
21716320Sbholler
21726320Sbholler	cmp	$0x40,%r8
21736320Sbholler	movntdq	%xmm0,-0x40(%rcx)
21746320Sbholler	movntdq	%xmm1,-0x30(%rcx)
21756320Sbholler
21766320Sbholler	movdqu	-0x20(%rdx),%xmm2
21776320Sbholler	movdqu	-0x10(%rdx),%xmm3
21786320Sbholler
21796320Sbholler	movntdq	%xmm2,-0x20(%rcx)
21806320Sbholler	movntdq	%xmm3,-0x10(%rcx)
21816320Sbholler
21826320Sbholler	jge	L(sse2_nt_move)
21836320Sbholler
21846320Sbholler	lea	L(Fix16EndTable)(%rip),%r10
21856320Sbholler	mov	%r8,%r9
21866320Sbholler	and	$0xFFFFFFFFFFFFFFF0,%r9
21876320Sbholler	add	%r9,%rcx
21886320Sbholler	add	%r9,%rdx
21896320Sbholler	sub	%r9,%r8
21906320Sbholler	shr	$0x4,%r9
21916320Sbholler	sfence
21926320Sbholler
21936320Sbholler	movslq	(%r10,%r9,4),%r11
21946320Sbholler	lea	(%r11,%r10,1),%r10
21956320Sbholler	jmpq	*%r10
21966320Sbholler
21976320Sbholler	.balign 16
21986320SbhollerL(Fix16EndTable):
21996320Sbholler	.int    L(fix16_0)-L(Fix16EndTable)
22006320Sbholler	.int    L(fix16_1)-L(Fix16EndTable)
22016320Sbholler	.int    L(fix16_2)-L(Fix16EndTable)
22026320Sbholler	.int    L(fix16_3)-L(Fix16EndTable)
22036320Sbholler
22046320Sbholler	.balign 16
22056320SbhollerL(fix16_3):
22066320Sbholler	movdqu -0x30(%rdx),%xmm1
22076320Sbholler	movdqa %xmm1,-0x30(%rcx)
22086320SbhollerL(fix16_2):
22096320Sbholler	movdqu -0x20(%rdx),%xmm2
22106320Sbholler	movdqa %xmm2,-0x20(%rcx)
22116320SbhollerL(fix16_1):
22126320Sbholler	movdqu -0x10(%rdx),%xmm3
22136320Sbholler	movdqa %xmm3,-0x10(%rcx)
22146320SbhollerL(fix16_0):
22156320Sbholler	lea    L(fwdPxQx)(%rip),%r10
22166320Sbholler	add    %r8,%rdx
22176320Sbholler	add    %r8,%rcx
22186320Sbholler
22196320Sbholler	movslq (%r10,%r8,4),%r9
22206320Sbholler	lea    (%r9,%r10,1),%r10
22216320Sbholler	jmpq   *%r10
22226320Sbholler
22236320Sbholler	.balign 16
22246320SbhollerL(pre_both_aligned):
22256320Sbholler	cmp    $0x80,%r8
22266320Sbholler	jl     L(fix_16b)
22276320Sbholler
22286320Sbholler	.balign 16
22296320SbhollerL(both_aligned):
22306320Sbholler
22316320Sbholler	/*
22326320Sbholler	 * this 'paired' load/load/store/store seems to do best.
22336320Sbholler	 */
22346320Sbholler	movdqa (%rdx),%xmm0
22356320Sbholler	movdqa 0x10(%rdx),%xmm1
22366320Sbholler
22376320Sbholler	movdqa %xmm0,(%rcx)
22386320Sbholler	movdqa %xmm1,0x10(%rcx)
22396320Sbholler	lea    -0x80(%r8),%r8
22406320Sbholler
22416320Sbholler	movdqa 0x20(%rdx),%xmm2
22426320Sbholler	movdqa 0x30(%rdx),%xmm3
22436320Sbholler
22446320Sbholler	movdqa %xmm2,0x20(%rcx)
22456320Sbholler	movdqa %xmm3,0x30(%rcx)
22466320Sbholler
22476320Sbholler	movdqa 0x40(%rdx),%xmm0
22486320Sbholler	movdqa 0x50(%rdx),%xmm1
22496320Sbholler	cmp    $0x80,%r8
22506320Sbholler
22516320Sbholler	movdqa %xmm0,0x40(%rcx)
22526320Sbholler	movdqa %xmm1,0x50(%rcx)
22536320Sbholler
22546320Sbholler	movdqa 0x60(%rdx),%xmm2
22556320Sbholler	movdqa 0x70(%rdx),%xmm3
22566320Sbholler	lea    0x80(%rdx),%rdx
22576320Sbholler	movdqa %xmm2,0x60(%rcx)
22586320Sbholler	movdqa %xmm3,0x70(%rcx)
22596320Sbholler	lea    0x80(%rcx),%rcx
22606320Sbholler	jge    L(both_aligned)
22616320Sbholler
22626320SbhollerL(fix_16b):
22636320Sbholler	add    %r8,%rcx
22646320Sbholler	lea    L(fwdPxQx)(%rip),%r10
22656320Sbholler	add    %r8,%rdx
22666320Sbholler
22676320Sbholler	movslq (%r10,%r8,4),%r9
22686320Sbholler	lea    (%r9,%r10,1),%r10
22696320Sbholler	jmpq   *%r10
22706320Sbholler
22716320Sbholler	.balign 16
22726320SbhollerL(Loop8byte_pre):
22736320Sbholler	# Use 8-byte moves
22746320Sbholler	mov    .largest_level_cache_size(%rip),%r9d
22756320Sbholler	shr    %r9		# take half of it
22766320Sbholler	cmp    %r9,%r8
22776320Sbholler	jg     L(byte8_nt_top)
22786320Sbholler	# Find out whether to use rep movsq
22796320Sbholler	cmp    $4096,%r8
22806320Sbholler	jle    L(byte8_top)
22816320Sbholler	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
22826320Sbholler	cmp    %r9,%r8
22836320Sbholler	jle    L(use_rep)
22846320Sbholler
22856320Sbholler	.balign     16
22866320SbhollerL(byte8_top):
22876320Sbholler	mov    (%rdx),%r9
22886320Sbholler	mov    0x8(%rdx),%r10
22896320Sbholler	lea    -0x40(%r8),%r8
22906320Sbholler	mov    %r9,(%rcx)
22916320Sbholler	mov    %r10,0x8(%rcx)
22926320Sbholler	mov    0x10(%rdx),%r11
22936320Sbholler	mov    0x18(%rdx),%r9
22946320Sbholler	mov    %r11,0x10(%rcx)
22956320Sbholler	mov    %r9,0x18(%rcx)
22966320Sbholler
22976320Sbholler	cmp    $0x40,%r8
22986320Sbholler	mov    0x20(%rdx),%r10
22996320Sbholler	mov    0x28(%rdx),%r11
23006320Sbholler	mov    %r10,0x20(%rcx)
23016320Sbholler	mov    %r11,0x28(%rcx)
23026320Sbholler	mov    0x30(%rdx),%r9
23036320Sbholler	mov    0x38(%rdx),%r10
23046320Sbholler	lea    0x40(%rdx),%rdx
23056320Sbholler	mov    %r9,0x30(%rcx)
23066320Sbholler	mov    %r10,0x38(%rcx)
23076320Sbholler	lea    0x40(%rcx),%rcx
23086320Sbholler	jg     L(byte8_top)
23096320Sbholler
23106320SbhollerL(byte8_end):
23116320Sbholler	lea    L(fwdPxQx)(%rip),%r10
23126320Sbholler	lea    (%rdx,%r8,1),%rdx
23136320Sbholler	lea    (%rcx,%r8,1),%rcx
23146320Sbholler
23156320Sbholler	movslq (%r10,%r8,4),%r9
23166320Sbholler	lea    (%r9,%r10,1),%r10
23176320Sbholler	jmpq   *%r10
23186320Sbholler
23196320Sbholler	.balign	16
23206320SbhollerL(use_rep):
23216320Sbholler	mov    %rdx,%rsi		# %rsi = source
23226320Sbholler	mov    %rcx,%rdi		# %rdi = destination
23236320Sbholler	mov    %r8,%rcx			# %rcx = count
23246320Sbholler	shrq   $3,%rcx			# 8-byte word count
23250Sstevel@tonic-gate	rep
23266320Sbholler	  movsq
23276320Sbholler	mov    %rsi,%rdx		# source
23286320Sbholler	mov    %rdi,%rcx		# destination
23296320Sbholler	andq   $7,%r8			# remainder
23306320Sbholler	jnz    L(byte8_end)
23310Sstevel@tonic-gate	ret
23320Sstevel@tonic-gate
23336320Sbholler	.balign 16
23346320SbhollerL(byte8_nt_top):
23356320Sbholler	sub    $0x40,%r8
23366320Sbholler	prefetchnta 0x180(%rdx)
23376320Sbholler	mov    (%rdx),%r9
23386320Sbholler	movnti %r9,(%rcx)
23396320Sbholler	mov    0x8(%rdx),%r10
23406320Sbholler	movnti %r10,0x8(%rcx)
23416320Sbholler	mov    0x10(%rdx),%r11
23426320Sbholler	movnti %r11,0x10(%rcx)
23436320Sbholler	mov    0x18(%rdx),%r9
23446320Sbholler	movnti %r9,0x18(%rcx)
23456320Sbholler	mov    0x20(%rdx),%r10
23466320Sbholler	movnti %r10,0x20(%rcx)
23476320Sbholler	mov    0x28(%rdx),%r11
23486320Sbholler	movnti %r11,0x28(%rcx)
23496320Sbholler	mov    0x30(%rdx),%r9
23506320Sbholler	movnti %r9,0x30(%rcx)
23516320Sbholler	mov    0x38(%rdx),%r10
23526320Sbholler	movnti %r10,0x38(%rcx)
23536320Sbholler
23546320Sbholler	lea    0x40(%rdx),%rdx
23556320Sbholler	lea    0x40(%rcx),%rcx
23566320Sbholler	cmp    $0x40,%r8
23576320Sbholler	jge    L(byte8_nt_top)
23586320Sbholler	sfence
23596320Sbholler	jmp    L(byte8_end)
23606320Sbholler
23616320Sbholler	SET_SIZE(memcpy)
23626320Sbholler
23636320Sbholler	.balign 16
23646320SbhollerL(CopyBackwards):
23656320Sbholler	mov    %rdx,%r8
23666320Sbholler	mov    %rdi,%rcx
23676320Sbholler	mov    %rsi,%rdx
23686320Sbholler	mov    %rdi,%rax		# return value
23696320Sbholler
23706320Sbholler	# ck alignment of last byte
23716320Sbholler	lea    (%rcx,%r8,1),%rcx
23726320Sbholler	test   $0x7,%rcx
23736320Sbholler	lea    (%rdx,%r8,1),%rdx
23746320Sbholler	jne    L(bk_align)
23756320Sbholler
23766320SbhollerL(bk_qw_aligned):
23776320Sbholler	lea    L(bkPxQx)(%rip),%r10
23786320Sbholler
23796320Sbholler	cmp    $0x90,%r8		# 144
23806320Sbholler	jg     L(bk_ck_sse2_alignment)
23816320Sbholler
23826320Sbholler	sub    %r8,%rcx
23836320Sbholler	sub    %r8,%rdx
23846320Sbholler
23856320Sbholler	movslq (%r10,%r8,4),%r9
23866320Sbholler	lea    (%r9,%r10,1),%r10
23876320Sbholler	jmpq   *%r10
23886320Sbholler
23896320Sbholler	.balign 16
23906320SbhollerL(bk_align):
23916320Sbholler	# only align if len > 8
23926320Sbholler	cmp    $8,%r8
23936320Sbholler	jle    L(bk_qw_aligned)
23946320Sbholler	test   $0x1,%rcx
23956320Sbholler	je     L(bk_tst2)
23966320Sbholler	dec    %rcx
23976320Sbholler	dec    %rdx
23986320Sbholler	dec    %r8
23996320Sbholler	mov    (%rdx),%r9b
24006320Sbholler	mov    %r9b,(%rcx)
24016320Sbholler
24026320SbhollerL(bk_tst2):
24036320Sbholler	test   $0x2,%rcx
24046320Sbholler	je     L(bk_tst3)
24056320Sbholler
24066320SbhollerL(bk_got2):
24076320Sbholler	sub    $0x2,%rcx
24086320Sbholler	sub    $0x2,%rdx
24096320Sbholler	sub    $0x2,%r8
24106320Sbholler	movzwq (%rdx),%r9
24116320Sbholler	mov    %r9w,(%rcx)
24126320Sbholler
24136320SbhollerL(bk_tst3):
24146320Sbholler	test   $0x4,%rcx
24156320Sbholler	je     L(bk_qw_aligned)
24166320Sbholler
24176320SbhollerL(bk_got3):
24186320Sbholler	sub    $0x4,%rcx
24196320Sbholler	sub    $0x4,%rdx
24206320Sbholler	sub    $0x4,%r8
24216320Sbholler	mov    (%rdx),%r9d
24226320Sbholler	mov    %r9d,(%rcx)
24236320Sbholler	jmp    L(bk_qw_aligned)
24246320Sbholler
24256320Sbholler	.balign 16
24266320SbhollerL(bk_ck_sse2_alignment):
24276320Sbholler	cmpl   $NO_SSE,.memops_method(%rip)
24286320Sbholler	je     L(bk_use_rep)
24296320Sbholler	# check alignment of last byte
24306320Sbholler	test   $0xf,%rcx
24316320Sbholler	jz     L(bk_sse2_cpy)
24326320Sbholler
24336320SbhollerL(bk_sse2_align):
24346320Sbholler	# only here if already aligned on at least a qword bndry
24356320Sbholler	sub    $0x8,%rcx
24366320Sbholler	sub    $0x8,%rdx
24376320Sbholler	sub    $0x8,%r8
24386320Sbholler	mov    (%rdx),%r9
24396320Sbholler	mov    %r9,(%rcx)
24406320Sbholler	#jmp   L(bk_sse2_cpy)
24416320Sbholler
24426320Sbholler	.balign 16
24436320SbhollerL(bk_sse2_cpy):
24446320Sbholler	sub    $0x80,%rcx		# 128
24456320Sbholler	sub    $0x80,%rdx
24466320Sbholler	movdqu 0x70(%rdx),%xmm3
24476320Sbholler	movdqu 0x60(%rdx),%xmm2
24486320Sbholler	movdqa %xmm3,0x70(%rcx)
24496320Sbholler	movdqa %xmm2,0x60(%rcx)
24506320Sbholler	sub    $0x80,%r8
24516320Sbholler	movdqu 0x50(%rdx),%xmm1
24526320Sbholler	movdqu 0x40(%rdx),%xmm0
24536320Sbholler	movdqa %xmm1,0x50(%rcx)
24546320Sbholler	movdqa %xmm0,0x40(%rcx)
24556320Sbholler
24566320Sbholler	cmp    $0x80,%r8
24576320Sbholler	movdqu 0x30(%rdx),%xmm3
24586320Sbholler	movdqu 0x20(%rdx),%xmm2
24596320Sbholler	movdqa %xmm3,0x30(%rcx)
24606320Sbholler	movdqa %xmm2,0x20(%rcx)
24616320Sbholler	movdqu 0x10(%rdx),%xmm1
24626320Sbholler	movdqu (%rdx),%xmm0
24636320Sbholler	movdqa %xmm1,0x10(%rcx)
24646320Sbholler	movdqa %xmm0,(%rcx)
24656320Sbholler	jge    L(bk_sse2_cpy)
24666320Sbholler
24676320SbhollerL(bk_sse2_cpy_end):
24686320Sbholler	lea    L(bkPxQx)(%rip),%r10
24696320Sbholler	sub    %r8,%rdx
24706320Sbholler	sub    %r8,%rcx
24716320Sbholler	movslq (%r10,%r8,4),%r9
24726320Sbholler	lea    (%r9,%r10,1),%r10
24736320Sbholler	jmpq   *%r10
24746320Sbholler
24756320Sbholler	.balign 16
24766320SbhollerL(bk_use_rep):
24776320Sbholler	xchg   %rcx,%r9
24786320Sbholler	mov    %rdx,%rsi		# source
24796320Sbholler	mov    %r9,%rdi			# destination
24806320Sbholler	mov    %r8,%rcx			# count
24816320Sbholler	sub    $8,%rsi
24826320Sbholler	sub    $8,%rdi
24836320Sbholler	shr    $3,%rcx
24846320Sbholler	std				# reverse direction
24856320Sbholler	rep
24866320Sbholler	  movsq
24876320Sbholler	cld				# reset direction flag
24886320Sbholler
24896320Sbholler	xchg   %rcx,%r9
24906320Sbholler	lea    L(bkPxQx)(%rip),%r10
24916320Sbholler	sub    %r8,%rdx
24926320Sbholler	sub    %r8,%rcx
24936320Sbholler	andq   $7,%r8			# remainder
24946320Sbholler	jz     2f
24956320Sbholler	movslq (%r10,%r8,4),%r9
24966320Sbholler	lea    (%r9,%r10,1),%r10
24976320Sbholler	jmpq   *%r10
24986320Sbholler2:
24996320Sbholler	ret
25006320Sbholler
25016320Sbholler	.balign 16
25026320SbhollerL(bkP0QI):
25036320Sbholler	mov    0x88(%rdx),%r10
25046320Sbholler	mov    %r10,0x88(%rcx)
25056320SbhollerL(bkP0QH):
25066320Sbholler	mov    0x80(%rdx),%r10
25076320Sbholler	mov    %r10,0x80(%rcx)
25086320SbhollerL(bkP0QG):
25096320Sbholler	mov    0x78(%rdx),%r9
25106320Sbholler	mov    %r9,0x78(%rcx)
25116320SbhollerL(bkP0QF):
25126320Sbholler	mov    0x70(%rdx),%r11
25136320Sbholler	mov    %r11,0x70(%rcx)
25146320SbhollerL(bkP0QE):
25156320Sbholler	mov    0x68(%rdx),%r10
25166320Sbholler	mov    %r10,0x68(%rcx)
25176320SbhollerL(bkP0QD):
25186320Sbholler	mov    0x60(%rdx),%r9
25196320Sbholler	mov    %r9,0x60(%rcx)
25206320SbhollerL(bkP0QC):
25216320Sbholler	mov    0x58(%rdx),%r11
25226320Sbholler	mov    %r11,0x58(%rcx)
25236320SbhollerL(bkP0QB):
25246320Sbholler	mov    0x50(%rdx),%r10
25256320Sbholler	mov    %r10,0x50(%rcx)
25266320SbhollerL(bkP0QA):
25276320Sbholler	mov    0x48(%rdx),%r9
25286320Sbholler	mov    %r9,0x48(%rcx)
25296320SbhollerL(bkP0Q9):
25306320Sbholler	mov    0x40(%rdx),%r11
25316320Sbholler	mov    %r11,0x40(%rcx)
25326320SbhollerL(bkP0Q8):
25336320Sbholler	mov    0x38(%rdx),%r10
25346320Sbholler	mov    %r10,0x38(%rcx)
25356320SbhollerL(bkP0Q7):
25366320Sbholler	mov    0x30(%rdx),%r9
25376320Sbholler	mov    %r9,0x30(%rcx)
25386320SbhollerL(bkP0Q6):
25396320Sbholler	mov    0x28(%rdx),%r11
25406320Sbholler	mov    %r11,0x28(%rcx)
25416320SbhollerL(bkP0Q5):
25426320Sbholler	mov    0x20(%rdx),%r10
25436320Sbholler	mov    %r10,0x20(%rcx)
25446320SbhollerL(bkP0Q4):
25456320Sbholler	mov    0x18(%rdx),%r9
25466320Sbholler	mov    %r9,0x18(%rcx)
25476320SbhollerL(bkP0Q3):
25486320Sbholler	mov    0x10(%rdx),%r11
25496320Sbholler	mov    %r11,0x10(%rcx)
25506320SbhollerL(bkP0Q2):
25516320Sbholler	mov    0x8(%rdx),%r10
25526320Sbholler	mov    %r10,0x8(%rcx)
25536320SbhollerL(bkP0Q1):
25546320Sbholler	mov    (%rdx),%r9
25556320Sbholler	mov    %r9,(%rcx)
25566320SbhollerL(bkP0Q0):
25576320Sbholler	ret
25586320Sbholler
25596320Sbholler	.balign 16
25606320SbhollerL(bkP1QI):
25616320Sbholler	mov    0x89(%rdx),%r10
25626320Sbholler	mov    %r10,0x89(%rcx)
25636320SbhollerL(bkP1QH):
25646320Sbholler	mov    0x81(%rdx),%r11
25656320Sbholler	mov    %r11,0x81(%rcx)
25666320SbhollerL(bkP1QG):
25676320Sbholler	mov    0x79(%rdx),%r10
25686320Sbholler	mov    %r10,0x79(%rcx)
25696320SbhollerL(bkP1QF):
25706320Sbholler	mov    0x71(%rdx),%r9
25716320Sbholler	mov    %r9,0x71(%rcx)
25726320SbhollerL(bkP1QE):
25736320Sbholler	mov    0x69(%rdx),%r11
25746320Sbholler	mov    %r11,0x69(%rcx)
25756320SbhollerL(bkP1QD):
25766320Sbholler	mov    0x61(%rdx),%r10
25776320Sbholler	mov    %r10,0x61(%rcx)
25786320SbhollerL(bkP1QC):
25796320Sbholler	mov    0x59(%rdx),%r9
25806320Sbholler	mov    %r9,0x59(%rcx)
25816320SbhollerL(bkP1QB):
25826320Sbholler	mov    0x51(%rdx),%r11
25836320Sbholler	mov    %r11,0x51(%rcx)
25846320SbhollerL(bkP1QA):
25856320Sbholler	mov    0x49(%rdx),%r10
25866320Sbholler	mov    %r10,0x49(%rcx)
25876320SbhollerL(bkP1Q9):
25886320Sbholler	mov    0x41(%rdx),%r9
25896320Sbholler	mov    %r9,0x41(%rcx)
25906320SbhollerL(bkP1Q8):
25916320Sbholler	mov    0x39(%rdx),%r11
25926320Sbholler	mov    %r11,0x39(%rcx)
25936320SbhollerL(bkP1Q7):
25946320Sbholler	mov    0x31(%rdx),%r10
25956320Sbholler	mov    %r10,0x31(%rcx)
25966320SbhollerL(bkP1Q6):
25976320Sbholler	mov    0x29(%rdx),%r9
25986320Sbholler	mov    %r9,0x29(%rcx)
25996320SbhollerL(bkP1Q5):
26006320Sbholler	mov    0x21(%rdx),%r11
26016320Sbholler	mov    %r11,0x21(%rcx)
26026320SbhollerL(bkP1Q4):
26036320Sbholler	mov    0x19(%rdx),%r10
26046320Sbholler	mov    %r10,0x19(%rcx)
26056320SbhollerL(bkP1Q3):
26066320Sbholler	mov    0x11(%rdx),%r9
26076320Sbholler	mov    %r9,0x11(%rcx)
26086320SbhollerL(bkP1Q2):
26096320Sbholler	mov    0x9(%rdx),%r11
26106320Sbholler	mov    %r11,0x9(%rcx)
26116320SbhollerL(bkP1Q1):
26126320Sbholler	mov    0x1(%rdx),%r10
26136320Sbholler	mov    %r10,0x1(%rcx)
26146320SbhollerL(bkP1Q0):
26156320Sbholler	mov    (%rdx),%r9b
26166320Sbholler	mov    %r9b,(%rcx)
26176320Sbholler	ret
26186320Sbholler
26196320Sbholler	.balign 16
26206320SbhollerL(bkP2QI):
26216320Sbholler	mov    0x8a(%rdx),%r10
26226320Sbholler	mov    %r10,0x8a(%rcx)
26236320SbhollerL(bkP2QH):
26246320Sbholler	mov    0x82(%rdx),%r11
26256320Sbholler	mov    %r11,0x82(%rcx)
26266320SbhollerL(bkP2QG):
26276320Sbholler	mov    0x7a(%rdx),%r10
26286320Sbholler	mov    %r10,0x7a(%rcx)
26296320SbhollerL(bkP2QF):
26306320Sbholler	mov    0x72(%rdx),%r9
26316320Sbholler	mov    %r9,0x72(%rcx)
26326320SbhollerL(bkP2QE):
26336320Sbholler	mov    0x6a(%rdx),%r11
26346320Sbholler	mov    %r11,0x6a(%rcx)
26356320SbhollerL(bkP2QD):
26366320Sbholler	mov    0x62(%rdx),%r10
26376320Sbholler	mov    %r10,0x62(%rcx)
26386320SbhollerL(bkP2QC):
26396320Sbholler	mov    0x5a(%rdx),%r9
26406320Sbholler	mov    %r9,0x5a(%rcx)
26416320SbhollerL(bkP2QB):
26426320Sbholler	mov    0x52(%rdx),%r11
26436320Sbholler	mov    %r11,0x52(%rcx)
26446320SbhollerL(bkP2QA):
26456320Sbholler	mov    0x4a(%rdx),%r10
26466320Sbholler	mov    %r10,0x4a(%rcx)
26476320SbhollerL(bkP2Q9):
26486320Sbholler	mov    0x42(%rdx),%r9
26496320Sbholler	mov    %r9,0x42(%rcx)
26506320SbhollerL(bkP2Q8):
26516320Sbholler	mov    0x3a(%rdx),%r11
26526320Sbholler	mov    %r11,0x3a(%rcx)
26536320SbhollerL(bkP2Q7):
26546320Sbholler	mov    0x32(%rdx),%r10
26556320Sbholler	mov    %r10,0x32(%rcx)
26566320SbhollerL(bkP2Q6):
26576320Sbholler	mov    0x2a(%rdx),%r9
26586320Sbholler	mov    %r9,0x2a(%rcx)
26596320SbhollerL(bkP2Q5):
26606320Sbholler	mov    0x22(%rdx),%r11
26616320Sbholler	mov    %r11,0x22(%rcx)
26626320SbhollerL(bkP2Q4):
26636320Sbholler	mov    0x1a(%rdx),%r10
26646320Sbholler	mov    %r10,0x1a(%rcx)
26656320SbhollerL(bkP2Q3):
26666320Sbholler	mov    0x12(%rdx),%r9
26676320Sbholler	mov    %r9,0x12(%rcx)
26686320SbhollerL(bkP2Q2):
26696320Sbholler	mov    0xa(%rdx),%r11
26706320Sbholler	mov    %r11,0xa(%rcx)
26716320SbhollerL(bkP2Q1):
26726320Sbholler	mov    0x2(%rdx),%r10
26736320Sbholler	mov    %r10,0x2(%rcx)
26746320SbhollerL(bkP2Q0):
26756320Sbholler	mov    (%rdx),%r9w
26766320Sbholler	mov    %r9w,(%rcx)
26776320Sbholler	ret
26786320Sbholler
/*
 * P == 3 backward-copy tails: residual lengths len = 3 + 8*n bytes.
 * Dispatched from the L(bkPxQx) jump table below; entering at L(bkP3Qn)
 * copies n quadwords at descending offsets (each label falls through to
 * the next) and L(bkP3Q0) finishes the last 3 bytes as disjoint
 * word-at-1 and byte-at-0 pieces.  %r9/%r10/%r11 rotate as scratch.
 */
26796320Sbholler	.balign 16
26806320SbhollerL(bkP3QI):
26816320Sbholler	mov    0x8b(%rdx),%r10
26826320Sbholler	mov    %r10,0x8b(%rcx)
26836320SbhollerL(bkP3QH):
26846320Sbholler	mov    0x83(%rdx),%r11
26856320Sbholler	mov    %r11,0x83(%rcx)
26866320SbhollerL(bkP3QG):
26876320Sbholler	mov    0x7b(%rdx),%r10
26886320Sbholler	mov    %r10,0x7b(%rcx)
26896320SbhollerL(bkP3QF):
26906320Sbholler	mov    0x73(%rdx),%r9
26916320Sbholler	mov    %r9,0x73(%rcx)
26926320SbhollerL(bkP3QE):
26936320Sbholler	mov    0x6b(%rdx),%r11
26946320Sbholler	mov    %r11,0x6b(%rcx)
26956320SbhollerL(bkP3QD):
26966320Sbholler	mov    0x63(%rdx),%r10
26976320Sbholler	mov    %r10,0x63(%rcx)
26986320SbhollerL(bkP3QC):
26996320Sbholler	mov    0x5b(%rdx),%r9
27006320Sbholler	mov    %r9,0x5b(%rcx)
27016320SbhollerL(bkP3QB):
27026320Sbholler	mov    0x53(%rdx),%r11
27036320Sbholler	mov    %r11,0x53(%rcx)
27046320SbhollerL(bkP3QA):
27056320Sbholler	mov    0x4b(%rdx),%r10
27066320Sbholler	mov    %r10,0x4b(%rcx)
27076320SbhollerL(bkP3Q9):
27086320Sbholler	mov    0x43(%rdx),%r9
27096320Sbholler	mov    %r9,0x43(%rcx)
27106320SbhollerL(bkP3Q8):
27116320Sbholler	mov    0x3b(%rdx),%r11
27126320Sbholler	mov    %r11,0x3b(%rcx)
27136320SbhollerL(bkP3Q7):
27146320Sbholler	mov    0x33(%rdx),%r10
27156320Sbholler	mov    %r10,0x33(%rcx)
27166320SbhollerL(bkP3Q6):
27176320Sbholler	mov    0x2b(%rdx),%r9
27186320Sbholler	mov    %r9,0x2b(%rcx)
27196320SbhollerL(bkP3Q5):
27206320Sbholler	mov    0x23(%rdx),%r11
27216320Sbholler	mov    %r11,0x23(%rcx)
27226320SbhollerL(bkP3Q4):
27236320Sbholler	mov    0x1b(%rdx),%r10
27246320Sbholler	mov    %r10,0x1b(%rcx)
27256320SbhollerL(bkP3Q3):
27266320Sbholler	mov    0x13(%rdx),%r9
27276320Sbholler	mov    %r9,0x13(%rcx)
27286320SbhollerL(bkP3Q2):
27296320Sbholler	mov    0xb(%rdx),%r11
27306320Sbholler	mov    %r11,0xb(%rcx)
27316320SbhollerL(bkP3Q1):
27326320Sbholler	mov    0x3(%rdx),%r10
27336320Sbholler	mov    %r10,0x3(%rcx)
27346320SbhollerL(bkP3Q0): # final 3 bytes: disjoint word-at-1 and byte-at-0 pieces
27356320Sbholler	mov    0x1(%rdx),%r9w
27366320Sbholler	mov    %r9w,0x1(%rcx)
27376320Sbholler	mov    (%rdx),%r10b
27386320Sbholler	mov    %r10b,(%rcx)
27396320Sbholler	ret
27406320Sbholler
/*
 * P == 4 backward-copy tails: residual lengths len = 4 + 8*n bytes.
 * Dispatched from the L(bkPxQx) jump table below; entering at L(bkP4Qn)
 * copies n quadwords at descending offsets (each label falls through)
 * and L(bkP4Q0) finishes with a single 32-bit move at offset 0.
 * %r9/%r10/%r11 rotate as scratch registers.
 */
27416320Sbholler	.balign 16
27426320SbhollerL(bkP4QI):
27436320Sbholler	mov    0x8c(%rdx),%r10
27446320Sbholler	mov    %r10,0x8c(%rcx)
27456320SbhollerL(bkP4QH):
27466320Sbholler	mov    0x84(%rdx),%r11
27476320Sbholler	mov    %r11,0x84(%rcx)
27486320SbhollerL(bkP4QG):
27496320Sbholler	mov    0x7c(%rdx),%r10
27506320Sbholler	mov    %r10,0x7c(%rcx)
27516320SbhollerL(bkP4QF):
27526320Sbholler	mov    0x74(%rdx),%r9
27536320Sbholler	mov    %r9,0x74(%rcx)
27546320SbhollerL(bkP4QE):
27556320Sbholler	mov    0x6c(%rdx),%r11
27566320Sbholler	mov    %r11,0x6c(%rcx)
27576320SbhollerL(bkP4QD):
27586320Sbholler	mov    0x64(%rdx),%r10
27596320Sbholler	mov    %r10,0x64(%rcx)
27606320SbhollerL(bkP4QC):
27616320Sbholler	mov    0x5c(%rdx),%r9
27626320Sbholler	mov    %r9,0x5c(%rcx)
27636320SbhollerL(bkP4QB):
27646320Sbholler	mov    0x54(%rdx),%r11
27656320Sbholler	mov    %r11,0x54(%rcx)
27666320SbhollerL(bkP4QA):
27676320Sbholler	mov    0x4c(%rdx),%r10
27686320Sbholler	mov    %r10,0x4c(%rcx)
27696320SbhollerL(bkP4Q9):
27706320Sbholler	mov    0x44(%rdx),%r9
27716320Sbholler	mov    %r9,0x44(%rcx)
27726320SbhollerL(bkP4Q8):
27736320Sbholler	mov    0x3c(%rdx),%r11
27746320Sbholler	mov    %r11,0x3c(%rcx)
27756320SbhollerL(bkP4Q7):
27766320Sbholler	mov    0x34(%rdx),%r10
27776320Sbholler	mov    %r10,0x34(%rcx)
27786320SbhollerL(bkP4Q6):
27796320Sbholler	mov    0x2c(%rdx),%r9
27806320Sbholler	mov    %r9,0x2c(%rcx)
27816320SbhollerL(bkP4Q5):
27826320Sbholler	mov    0x24(%rdx),%r11
27836320Sbholler	mov    %r11,0x24(%rcx)
27846320SbhollerL(bkP4Q4):
27856320Sbholler	mov    0x1c(%rdx),%r10
27866320Sbholler	mov    %r10,0x1c(%rcx)
27876320SbhollerL(bkP4Q3):
27886320Sbholler	mov    0x14(%rdx),%r9
27896320Sbholler	mov    %r9,0x14(%rcx)
27906320SbhollerL(bkP4Q2):
27916320Sbholler	mov    0xc(%rdx),%r11
27926320Sbholler	mov    %r11,0xc(%rcx)
27936320SbhollerL(bkP4Q1):
27946320Sbholler	mov    0x4(%rdx),%r10
27956320Sbholler	mov    %r10,0x4(%rcx)
27966320SbhollerL(bkP4Q0):
	/* final 4 bytes: one 32-bit move at offset 0 */
27976320Sbholler	mov    (%rdx),%r9d
27986320Sbholler	mov    %r9d,(%rcx)
27996320Sbholler	ret
28006320Sbholler
/*
 * P == 5 backward-copy tails: residual lengths len = 5 + 8*n bytes.
 * Dispatched from the L(bkPxQx) jump table below; entering at L(bkP5Qn)
 * copies n quadwords at descending offsets (each label falls through)
 * and L(bkP5Q0) finishes the last 5 bytes as disjoint dword-at-1 and
 * byte-at-0 pieces.  %r9/%r10/%r11 rotate as scratch registers.
 */
28016320Sbholler	.balign 16
28026320SbhollerL(bkP5QI):
28036320Sbholler	mov    0x8d(%rdx),%r10
28046320Sbholler	mov    %r10,0x8d(%rcx)
28056320SbhollerL(bkP5QH):
28066320Sbholler	mov    0x85(%rdx),%r9
28076320Sbholler	mov    %r9,0x85(%rcx)
28086320SbhollerL(bkP5QG):
28096320Sbholler	mov    0x7d(%rdx),%r11
28106320Sbholler	mov    %r11,0x7d(%rcx)
28116320SbhollerL(bkP5QF):
28126320Sbholler	mov    0x75(%rdx),%r10
28136320Sbholler	mov    %r10,0x75(%rcx)
28146320SbhollerL(bkP5QE):
28156320Sbholler	mov    0x6d(%rdx),%r9
28166320Sbholler	mov    %r9,0x6d(%rcx)
28176320SbhollerL(bkP5QD):
28186320Sbholler	mov    0x65(%rdx),%r11
28196320Sbholler	mov    %r11,0x65(%rcx)
28206320SbhollerL(bkP5QC):
28216320Sbholler	mov    0x5d(%rdx),%r10
28226320Sbholler	mov    %r10,0x5d(%rcx)
28236320SbhollerL(bkP5QB):
28246320Sbholler	mov    0x55(%rdx),%r9
28256320Sbholler	mov    %r9,0x55(%rcx)
28266320SbhollerL(bkP5QA):
28276320Sbholler	mov    0x4d(%rdx),%r11
28286320Sbholler	mov    %r11,0x4d(%rcx)
28296320SbhollerL(bkP5Q9):
28306320Sbholler	mov    0x45(%rdx),%r10
28316320Sbholler	mov    %r10,0x45(%rcx)
28326320SbhollerL(bkP5Q8):
28336320Sbholler	mov    0x3d(%rdx),%r9
28346320Sbholler	mov    %r9,0x3d(%rcx)
28356320SbhollerL(bkP5Q7):
28366320Sbholler	mov    0x35(%rdx),%r11
28376320Sbholler	mov    %r11,0x35(%rcx)
28386320SbhollerL(bkP5Q6):
28396320Sbholler	mov    0x2d(%rdx),%r10
28406320Sbholler	mov    %r10,0x2d(%rcx)
28416320SbhollerL(bkP5Q5):
28426320Sbholler	mov    0x25(%rdx),%r9
28436320Sbholler	mov    %r9,0x25(%rcx)
28446320SbhollerL(bkP5Q4):
28456320Sbholler	mov    0x1d(%rdx),%r11
28466320Sbholler	mov    %r11,0x1d(%rcx)
28476320SbhollerL(bkP5Q3):
28486320Sbholler	mov    0x15(%rdx),%r10
28496320Sbholler	mov    %r10,0x15(%rcx)
28506320SbhollerL(bkP5Q2):
28516320Sbholler	mov    0xd(%rdx),%r9
28526320Sbholler	mov    %r9,0xd(%rcx)
28536320SbhollerL(bkP5Q1):
28546320Sbholler	mov    0x5(%rdx),%r11
28556320Sbholler	mov    %r11,0x5(%rcx)
28566320SbhollerL(bkP5Q0): # final 5 bytes: disjoint dword-at-1 and byte-at-0 pieces
28576320Sbholler	mov    0x1(%rdx),%r9d
28586320Sbholler	mov    %r9d,0x1(%rcx)
28596320Sbholler	mov    (%rdx),%r10b
28606320Sbholler	mov    %r10b,(%rcx)
28616320Sbholler	ret
28626320Sbholler
/*
 * P == 6 backward-copy tails: residual lengths len = 6 + 8*n bytes.
 * Dispatched from the L(bkPxQx) jump table below; entering at L(bkP6Qn)
 * copies n quadwords at descending offsets (each label falls through)
 * and L(bkP6Q0) finishes the last 6 bytes as disjoint dword-at-2 and
 * word-at-0 pieces.  %r9/%r10/%r11 rotate as scratch registers.
 */
28636320Sbholler	.balign 16
28646320SbhollerL(bkP6QI):
28656320Sbholler	mov    0x8e(%rdx),%r10
28666320Sbholler	mov    %r10,0x8e(%rcx)
28676320SbhollerL(bkP6QH):
28686320Sbholler	mov    0x86(%rdx),%r11
28696320Sbholler	mov    %r11,0x86(%rcx)
28706320SbhollerL(bkP6QG):
28716320Sbholler	mov    0x7e(%rdx),%r10
28726320Sbholler	mov    %r10,0x7e(%rcx)
28736320SbhollerL(bkP6QF):
28746320Sbholler	mov    0x76(%rdx),%r9
28756320Sbholler	mov    %r9,0x76(%rcx)
28766320SbhollerL(bkP6QE):
28776320Sbholler	mov    0x6e(%rdx),%r11
28786320Sbholler	mov    %r11,0x6e(%rcx)
28796320SbhollerL(bkP6QD):
28806320Sbholler	mov    0x66(%rdx),%r10
28816320Sbholler	mov    %r10,0x66(%rcx)
28826320SbhollerL(bkP6QC):
28836320Sbholler	mov    0x5e(%rdx),%r9
28846320Sbholler	mov    %r9,0x5e(%rcx)
28856320SbhollerL(bkP6QB):
28866320Sbholler	mov    0x56(%rdx),%r11
28876320Sbholler	mov    %r11,0x56(%rcx)
28886320SbhollerL(bkP6QA):
28896320Sbholler	mov    0x4e(%rdx),%r10
28906320Sbholler	mov    %r10,0x4e(%rcx)
28916320SbhollerL(bkP6Q9):
28926320Sbholler	mov    0x46(%rdx),%r9
28936320Sbholler	mov    %r9,0x46(%rcx)
28946320SbhollerL(bkP6Q8):
28956320Sbholler	mov    0x3e(%rdx),%r11
28966320Sbholler	mov    %r11,0x3e(%rcx)
28976320SbhollerL(bkP6Q7):
28986320Sbholler	mov    0x36(%rdx),%r10
28996320Sbholler	mov    %r10,0x36(%rcx)
29006320SbhollerL(bkP6Q6):
29016320Sbholler	mov    0x2e(%rdx),%r9
29026320Sbholler	mov    %r9,0x2e(%rcx)
29036320SbhollerL(bkP6Q5):
29046320Sbholler	mov    0x26(%rdx),%r11
29056320Sbholler	mov    %r11,0x26(%rcx)
29066320SbhollerL(bkP6Q4):
29076320Sbholler	mov    0x1e(%rdx),%r10
29086320Sbholler	mov    %r10,0x1e(%rcx)
29096320SbhollerL(bkP6Q3):
29106320Sbholler	mov    0x16(%rdx),%r9
29116320Sbholler	mov    %r9,0x16(%rcx)
29126320SbhollerL(bkP6Q2):
29136320Sbholler	mov    0xe(%rdx),%r11
29146320Sbholler	mov    %r11,0xe(%rcx)
29156320SbhollerL(bkP6Q1):
29166320Sbholler	mov    0x6(%rdx),%r10
29176320Sbholler	mov    %r10,0x6(%rcx)
29186320SbhollerL(bkP6Q0): # final 6 bytes: disjoint dword-at-2 and word-at-0 pieces
29196320Sbholler	mov    0x2(%rdx),%r9d
29206320Sbholler	mov    %r9d,0x2(%rcx)
29216320Sbholler	mov    (%rdx),%r10w
29226320Sbholler	mov    %r10w,(%rcx)
29236320Sbholler	ret
29246320Sbholler
/*
 * P == 7 backward-copy tails: residual lengths len = 7 + 8*n bytes.
 * Dispatched from the L(bkPxQx) jump table below; entering at L(bkP7Qn)
 * copies n quadwords at descending offsets (each label falls through)
 * and L(bkP7Q0) finishes the last 7 bytes as disjoint dword-at-3,
 * word-at-1 and byte-at-0 pieces.  %r9/%r10/%r11 rotate as scratch.
 */
29256320Sbholler	.balign 16
29266320SbhollerL(bkP7QI):
29276320Sbholler	mov    0x8f(%rdx),%r10
29286320Sbholler	mov    %r10,0x8f(%rcx)
29296320SbhollerL(bkP7QH):
29306320Sbholler	mov    0x87(%rdx),%r11
29316320Sbholler	mov    %r11,0x87(%rcx)
29326320SbhollerL(bkP7QG):
29336320Sbholler	mov    0x7f(%rdx),%r10
29346320Sbholler	mov    %r10,0x7f(%rcx)
29356320SbhollerL(bkP7QF):
29366320Sbholler	mov    0x77(%rdx),%r9
29376320Sbholler	mov    %r9,0x77(%rcx)
29386320SbhollerL(bkP7QE):
29396320Sbholler	mov    0x6f(%rdx),%r11
29406320Sbholler	mov    %r11,0x6f(%rcx)
29416320SbhollerL(bkP7QD):
29426320Sbholler	mov    0x67(%rdx),%r10
29436320Sbholler	mov    %r10,0x67(%rcx)
29446320SbhollerL(bkP7QC):
29456320Sbholler	mov    0x5f(%rdx),%r9
29466320Sbholler	mov    %r9,0x5f(%rcx)
29476320SbhollerL(bkP7QB):
29486320Sbholler	mov    0x57(%rdx),%r11
29496320Sbholler	mov    %r11,0x57(%rcx)
29506320SbhollerL(bkP7QA):
29516320Sbholler	mov    0x4f(%rdx),%r10
29526320Sbholler	mov    %r10,0x4f(%rcx)
29536320SbhollerL(bkP7Q9):
29546320Sbholler	mov    0x47(%rdx),%r9
29556320Sbholler	mov    %r9,0x47(%rcx)
29566320SbhollerL(bkP7Q8):
29576320Sbholler	mov    0x3f(%rdx),%r11
29586320Sbholler	mov    %r11,0x3f(%rcx)
29596320SbhollerL(bkP7Q7):
29606320Sbholler	mov    0x37(%rdx),%r10
29616320Sbholler	mov    %r10,0x37(%rcx)
29626320SbhollerL(bkP7Q6):
29636320Sbholler	mov    0x2f(%rdx),%r9
29646320Sbholler	mov    %r9,0x2f(%rcx)
29656320SbhollerL(bkP7Q5):
29666320Sbholler	mov    0x27(%rdx),%r11
29676320Sbholler	mov    %r11,0x27(%rcx)
29686320SbhollerL(bkP7Q4):
29696320Sbholler	mov    0x1f(%rdx),%r10
29706320Sbholler	mov    %r10,0x1f(%rcx)
29716320SbhollerL(bkP7Q3):
29726320Sbholler	mov    0x17(%rdx),%r9
29736320Sbholler	mov    %r9,0x17(%rcx)
29746320SbhollerL(bkP7Q2):
29756320Sbholler	mov    0xf(%rdx),%r11
29766320Sbholler	mov    %r11,0xf(%rcx)
29776320SbhollerL(bkP7Q1):
29786320Sbholler	mov    0x7(%rdx),%r10
29796320Sbholler	mov    %r10,0x7(%rcx)
29806320SbhollerL(bkP7Q0): # final 7 bytes: disjoint dword-at-3, word-at-1, byte-at-0 pieces
29816320Sbholler	mov    0x3(%rdx),%r9d
29826320Sbholler	mov    %r9d,0x3(%rcx)
29836320Sbholler	mov    0x1(%rdx),%r10w
29846320Sbholler	mov    %r10w,0x1(%rcx)
29856320Sbholler	mov    (%rdx),%r11b
29866320Sbholler	mov    %r11b,(%rcx)
29876320Sbholler	ret
29886320Sbholler
/*
 * Dispatch table for the backward-copy tail chains above.  Entry k
 * (k = 0 .. 151) is the 32-bit offset, relative to L(bkPxQx), of the
 * code that copies exactly k residual bytes: with k = P + 8*Q
 * (P = 0..7, Q = 0..18), entry k points at label L(bkP<P>Q<Q>), where Q
 * is written as a hex-style digit (A-I standing for 10-18).  The rows
 * below are grouped eight entries (P = 0..7) per Q value.  The backward
 * copy's dispatch code earlier in this file indexes this table with the
 * remaining byte count; entries for Q >= some of the chain heads
 * (e.g. L(bkP0Q0)) refer to labels defined above this excerpt.
 */
29896320Sbholler		.balign 16
29906320SbhollerL(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
29916320Sbholler		.int L(bkP1Q0)-L(bkPxQx)
29926320Sbholler		.int L(bkP2Q0)-L(bkPxQx)
29936320Sbholler		.int L(bkP3Q0)-L(bkPxQx)
29946320Sbholler		.int L(bkP4Q0)-L(bkPxQx)
29956320Sbholler		.int L(bkP5Q0)-L(bkPxQx)
29966320Sbholler		.int L(bkP6Q0)-L(bkPxQx)
29976320Sbholler		.int L(bkP7Q0)-L(bkPxQx)
29986320Sbholler
29996320Sbholler		.int L(bkP0Q1)-L(bkPxQx)
30006320Sbholler		.int L(bkP1Q1)-L(bkPxQx)
30016320Sbholler		.int L(bkP2Q1)-L(bkPxQx)
30026320Sbholler		.int L(bkP3Q1)-L(bkPxQx)
30036320Sbholler		.int L(bkP4Q1)-L(bkPxQx)
30046320Sbholler		.int L(bkP5Q1)-L(bkPxQx)
30056320Sbholler		.int L(bkP6Q1)-L(bkPxQx)
30066320Sbholler		.int L(bkP7Q1)-L(bkPxQx)
30076320Sbholler
30086320Sbholler		.int L(bkP0Q2)-L(bkPxQx)
30096320Sbholler		.int L(bkP1Q2)-L(bkPxQx)
30106320Sbholler		.int L(bkP2Q2)-L(bkPxQx)
30116320Sbholler		.int L(bkP3Q2)-L(bkPxQx)
30126320Sbholler		.int L(bkP4Q2)-L(bkPxQx)
30136320Sbholler		.int L(bkP5Q2)-L(bkPxQx)
30146320Sbholler		.int L(bkP6Q2)-L(bkPxQx)
30156320Sbholler		.int L(bkP7Q2)-L(bkPxQx)
30166320Sbholler
30176320Sbholler		.int L(bkP0Q3)-L(bkPxQx)
30186320Sbholler		.int L(bkP1Q3)-L(bkPxQx)
30196320Sbholler		.int L(bkP2Q3)-L(bkPxQx)
30206320Sbholler		.int L(bkP3Q3)-L(bkPxQx)
30216320Sbholler		.int L(bkP4Q3)-L(bkPxQx)
30226320Sbholler		.int L(bkP5Q3)-L(bkPxQx)
30236320Sbholler		.int L(bkP6Q3)-L(bkPxQx)
30246320Sbholler		.int L(bkP7Q3)-L(bkPxQx)
30256320Sbholler
30266320Sbholler		.int L(bkP0Q4)-L(bkPxQx)
30276320Sbholler		.int L(bkP1Q4)-L(bkPxQx)
30286320Sbholler		.int L(bkP2Q4)-L(bkPxQx)
30296320Sbholler		.int L(bkP3Q4)-L(bkPxQx)
30306320Sbholler		.int L(bkP4Q4)-L(bkPxQx)
30316320Sbholler		.int L(bkP5Q4)-L(bkPxQx)
30326320Sbholler		.int L(bkP6Q4)-L(bkPxQx)
30336320Sbholler		.int L(bkP7Q4)-L(bkPxQx)
30346320Sbholler
30356320Sbholler		.int L(bkP0Q5)-L(bkPxQx)
30366320Sbholler		.int L(bkP1Q5)-L(bkPxQx)
30376320Sbholler		.int L(bkP2Q5)-L(bkPxQx)
30386320Sbholler		.int L(bkP3Q5)-L(bkPxQx)
30396320Sbholler		.int L(bkP4Q5)-L(bkPxQx)
30406320Sbholler		.int L(bkP5Q5)-L(bkPxQx)
30416320Sbholler		.int L(bkP6Q5)-L(bkPxQx)
30426320Sbholler		.int L(bkP7Q5)-L(bkPxQx)
30436320Sbholler
30446320Sbholler		.int L(bkP0Q6)-L(bkPxQx)
30456320Sbholler		.int L(bkP1Q6)-L(bkPxQx)
30466320Sbholler		.int L(bkP2Q6)-L(bkPxQx)
30476320Sbholler		.int L(bkP3Q6)-L(bkPxQx)
30486320Sbholler		.int L(bkP4Q6)-L(bkPxQx)
30496320Sbholler		.int L(bkP5Q6)-L(bkPxQx)
30506320Sbholler		.int L(bkP6Q6)-L(bkPxQx)
30516320Sbholler		.int L(bkP7Q6)-L(bkPxQx)
30526320Sbholler
30536320Sbholler		.int L(bkP0Q7)-L(bkPxQx)
30546320Sbholler		.int L(bkP1Q7)-L(bkPxQx)
30556320Sbholler		.int L(bkP2Q7)-L(bkPxQx)
30566320Sbholler		.int L(bkP3Q7)-L(bkPxQx)
30576320Sbholler		.int L(bkP4Q7)-L(bkPxQx)
30586320Sbholler		.int L(bkP5Q7)-L(bkPxQx)
30596320Sbholler		.int L(bkP6Q7)-L(bkPxQx)
30606320Sbholler		.int L(bkP7Q7)-L(bkPxQx)
30616320Sbholler
30626320Sbholler		.int L(bkP0Q8)-L(bkPxQx)
30636320Sbholler		.int L(bkP1Q8)-L(bkPxQx)
30646320Sbholler		.int L(bkP2Q8)-L(bkPxQx)
30656320Sbholler		.int L(bkP3Q8)-L(bkPxQx)
30666320Sbholler		.int L(bkP4Q8)-L(bkPxQx)
30676320Sbholler		.int L(bkP5Q8)-L(bkPxQx)
30686320Sbholler		.int L(bkP6Q8)-L(bkPxQx)
30696320Sbholler		.int L(bkP7Q8)-L(bkPxQx)
30706320Sbholler
30716320Sbholler		.int L(bkP0Q9)-L(bkPxQx)
30726320Sbholler		.int L(bkP1Q9)-L(bkPxQx)
30736320Sbholler		.int L(bkP2Q9)-L(bkPxQx)
30746320Sbholler		.int L(bkP3Q9)-L(bkPxQx)
30756320Sbholler		.int L(bkP4Q9)-L(bkPxQx)
30766320Sbholler		.int L(bkP5Q9)-L(bkPxQx)
30776320Sbholler		.int L(bkP6Q9)-L(bkPxQx)
30786320Sbholler		.int L(bkP7Q9)-L(bkPxQx)
30796320Sbholler
30806320Sbholler		.int L(bkP0QA)-L(bkPxQx)
30816320Sbholler		.int L(bkP1QA)-L(bkPxQx)
30826320Sbholler		.int L(bkP2QA)-L(bkPxQx)
30836320Sbholler		.int L(bkP3QA)-L(bkPxQx)
30846320Sbholler		.int L(bkP4QA)-L(bkPxQx)
30856320Sbholler		.int L(bkP5QA)-L(bkPxQx)
30866320Sbholler		.int L(bkP6QA)-L(bkPxQx)
30876320Sbholler		.int L(bkP7QA)-L(bkPxQx)
30886320Sbholler
30896320Sbholler		.int L(bkP0QB)-L(bkPxQx)
30906320Sbholler		.int L(bkP1QB)-L(bkPxQx)
30916320Sbholler		.int L(bkP2QB)-L(bkPxQx)
30926320Sbholler		.int L(bkP3QB)-L(bkPxQx)
30936320Sbholler		.int L(bkP4QB)-L(bkPxQx)
30946320Sbholler		.int L(bkP5QB)-L(bkPxQx)
30956320Sbholler		.int L(bkP6QB)-L(bkPxQx)
30966320Sbholler		.int L(bkP7QB)-L(bkPxQx)
30976320Sbholler
30986320Sbholler		.int L(bkP0QC)-L(bkPxQx)
30996320Sbholler		.int L(bkP1QC)-L(bkPxQx)
31006320Sbholler		.int L(bkP2QC)-L(bkPxQx)
31016320Sbholler		.int L(bkP3QC)-L(bkPxQx)
31026320Sbholler		.int L(bkP4QC)-L(bkPxQx)
31036320Sbholler		.int L(bkP5QC)-L(bkPxQx)
31046320Sbholler		.int L(bkP6QC)-L(bkPxQx)
31056320Sbholler		.int L(bkP7QC)-L(bkPxQx)
31066320Sbholler
31076320Sbholler		.int L(bkP0QD)-L(bkPxQx)
31086320Sbholler		.int L(bkP1QD)-L(bkPxQx)
31096320Sbholler		.int L(bkP2QD)-L(bkPxQx)
31106320Sbholler		.int L(bkP3QD)-L(bkPxQx)
31116320Sbholler		.int L(bkP4QD)-L(bkPxQx)
31126320Sbholler		.int L(bkP5QD)-L(bkPxQx)
31136320Sbholler		.int L(bkP6QD)-L(bkPxQx)
31146320Sbholler		.int L(bkP7QD)-L(bkPxQx)
31156320Sbholler
31166320Sbholler		.int L(bkP0QE)-L(bkPxQx)
31176320Sbholler		.int L(bkP1QE)-L(bkPxQx)
31186320Sbholler		.int L(bkP2QE)-L(bkPxQx)
31196320Sbholler		.int L(bkP3QE)-L(bkPxQx)
31206320Sbholler		.int L(bkP4QE)-L(bkPxQx)
31216320Sbholler		.int L(bkP5QE)-L(bkPxQx)
31226320Sbholler		.int L(bkP6QE)-L(bkPxQx)
31236320Sbholler		.int L(bkP7QE)-L(bkPxQx)
31246320Sbholler
31256320Sbholler		.int L(bkP0QF)-L(bkPxQx)
31266320Sbholler		.int L(bkP1QF)-L(bkPxQx)
31276320Sbholler		.int L(bkP2QF)-L(bkPxQx)
31286320Sbholler		.int L(bkP3QF)-L(bkPxQx)
31296320Sbholler		.int L(bkP4QF)-L(bkPxQx)
31306320Sbholler		.int L(bkP5QF)-L(bkPxQx)
31316320Sbholler		.int L(bkP6QF)-L(bkPxQx)
31326320Sbholler		.int L(bkP7QF)-L(bkPxQx)
31336320Sbholler
31346320Sbholler		.int L(bkP0QG)-L(bkPxQx)
31356320Sbholler		.int L(bkP1QG)-L(bkPxQx)
31366320Sbholler		.int L(bkP2QG)-L(bkPxQx)
31376320Sbholler		.int L(bkP3QG)-L(bkPxQx)
31386320Sbholler		.int L(bkP4QG)-L(bkPxQx)
31396320Sbholler		.int L(bkP5QG)-L(bkPxQx)
31406320Sbholler		.int L(bkP6QG)-L(bkPxQx)
31416320Sbholler		.int L(bkP7QG)-L(bkPxQx)
31426320Sbholler
31436320Sbholler		.int L(bkP0QH)-L(bkPxQx)
31446320Sbholler		.int L(bkP1QH)-L(bkPxQx)
31456320Sbholler		.int L(bkP2QH)-L(bkPxQx)
31466320Sbholler		.int L(bkP3QH)-L(bkPxQx)
31476320Sbholler		.int L(bkP4QH)-L(bkPxQx)
31486320Sbholler		.int L(bkP5QH)-L(bkPxQx)
31496320Sbholler		.int L(bkP6QH)-L(bkPxQx)
31506320Sbholler		.int L(bkP7QH)-L(bkPxQx)
31516320Sbholler
31526320Sbholler		.int L(bkP0QI)-L(bkPxQx)
31536320Sbholler		.int L(bkP1QI)-L(bkPxQx)
31546320Sbholler		.int L(bkP2QI)-L(bkPxQx)
31556320Sbholler		.int L(bkP3QI)-L(bkPxQx)
31566320Sbholler		.int L(bkP4QI)-L(bkPxQx)
31576320Sbholler		.int L(bkP5QI)-L(bkPxQx)
31586320Sbholler		.int L(bkP6QI)-L(bkPxQx)
31596320Sbholler		.int L(bkP7QI)-L(bkPxQx)
31606320Sbholler
	/* record the ELF symbol size of memmove (macro from sys/asm_linkage.h) */
31610Sstevel@tonic-gate	SET_SIZE(memmove)
3162