xref: /onnv-gate/usr/src/lib/libc/amd64/gen/memcpy.s (revision 10024:2213a466547f)
10Sstevel@tonic-gate/*
26320Sbholler * CDDL HEADER START
36320Sbholler *
46320Sbholler * The contents of this file are subject to the terms of the
56320Sbholler * Common Development and Distribution License (the "License").
66320Sbholler * You may not use this file except in compliance with the License.
76320Sbholler *
86320Sbholler * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
96320Sbholler * or http://www.opensolaris.org/os/licensing.
106320Sbholler * See the License for the specific language governing permissions
116320Sbholler * and limitations under the License.
126320Sbholler *
136320Sbholler * When distributing Covered Code, include this CDDL HEADER in each
146320Sbholler * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
156320Sbholler * If applicable, add the following below this CDDL HEADER, with the
166320Sbholler * fields enclosed by brackets "[]" replaced with your own identifying
176320Sbholler * information: Portions Copyright [yyyy] [name of copyright owner]
186320Sbholler *
196320Sbholler * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate
220Sstevel@tonic-gate/*
23*10024Sbostrovs * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
246812Sraf * Use is subject to license terms.
256812Sraf */
266812Sraf
276812Sraf/*
286320Sbholler * Copyright (c) 2008, Intel Corporation
290Sstevel@tonic-gate * All rights reserved.
300Sstevel@tonic-gate */
310Sstevel@tonic-gate
326320Sbholler/*
336320Sbholler * memcpy.s - copies two blocks of memory
346320Sbholler *	Implements memcpy() and memmove() libc primitives.
356320Sbholler */
366812Sraf
377298SMark.J.Nelson@Sun.COM	.file	"memcpy.s"
380Sstevel@tonic-gate
390Sstevel@tonic-gate#include <sys/asm_linkage.h>
406812Sraf
410Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memmove,function)
420Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memcpy,function)
430Sstevel@tonic-gate
440Sstevel@tonic-gate#include "cache.h"
456320Sbholler#include "proc64_id.h"
460Sstevel@tonic-gate
476320Sbholler#define L(s) .memcpy/**/s
486320Sbholler
496320Sbholler/*
506320Sbholler * memcpy algorithm overview:
516320Sbholler *
526320Sbholler * Thresholds used below were determined experimentally.
536320Sbholler *
546320Sbholler * Pseudo code:
556320Sbholler *
56*10024Sbostrovs * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
57*10024Sbostrovs * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
58*10024Sbostrovs * future AMD processors.
59*10024Sbostrovs *
60*10024Sbostrovs *
616320Sbholler * If (size <= 128 bytes) {
626320Sbholler *	do unrolled code (primarily 8-byte loads/stores) regardless of
636320Sbholler *	alignment.
646320Sbholler * } else {
656320Sbholler *	Align destination to 16-byte boundary
666320Sbholler *
676320Sbholler *      if (NO_SSE) {
686320Sbholler *		If (size > half of the largest level cache) {
696320Sbholler *			Use 8-byte non-temporal stores (64-bytes/loop)
706320Sbholler *		} else {
716320Sbholler *			if (size > 4K && size <= half l1 cache size) {
726320Sbholler *				Use rep movsq
736320Sbholler *			} else {
746320Sbholler *				Use 8-byte loads/stores (64 bytes per loop)
756320Sbholler *			}
766320Sbholler *		}
776320Sbholler *
786320Sbholler *	} else { **USE SSE**
796320Sbholler *		If (size > half of the largest level cache) {
806320Sbholler *			Use 16-byte non-temporal stores (128-bytes per loop)
816320Sbholler *		} else {
826320Sbholler *			If (both source and destination are aligned) {
836320Sbholler *			    Use 16-byte aligned loads and stores (128 bytes/loop)
846320Sbholler *			} else {
856320Sbholler *			    use pairs of xmm registers with SSE2 or SSSE3
866320Sbholler *			    instructions to concatenate and shift appropriately
876320Sbholler *			    to account for source unalignment. This enables
886320Sbholler *			    16-byte aligned loads to be done.
896320Sbholler *			}
906320Sbholler *		}
 *	}
926320Sbholler *
936320Sbholler *	Finish any remaining bytes via unrolled code above.
946320Sbholler * }
956320Sbholler *
966320Sbholler * memmove overview:
976320Sbholler *	memmove is the same as memcpy except one case where copy needs to be
986320Sbholler *	done backwards. The copy backwards code is done in a similar manner.
996320Sbholler */
1006320Sbholler
1016320Sbholler	ENTRY(memmove)
1026320Sbholler	cmp	%rsi,%rdi		# if dst <= src
1036320Sbholler	jbe	L(CopyForward)		# then do copy forward
1046320Sbholler	mov	%rsi,%r9		# move src to r9
1056320Sbholler	add	%rdx,%r9		# add len to get addr of end of src
1066320Sbholler	cmp	%r9,%rdi		# if dst < end of src
1076320Sbholler	jb	L(CopyBackwards)	# then do copy backwards
1086320Sbholler	jmp	L(CopyForward)
1096320Sbholler
1106320Sbholler	ENTRY (memcpy)
1116320SbhollerL(CopyForward):
1126320Sbholler	mov    %rdx,%r8
1136320Sbholler	mov    %rdi,%rcx
1146320Sbholler	mov    %rsi,%rdx
1156320Sbholler	mov    %rdi,%rax
1166320Sbholler	lea    L(fwdPxQx)(%rip),%r11
1176320Sbholler	cmp    $0x80,%r8		# 128
1186320Sbholler	jg     L(ck_use_sse2)
1196320Sbholler	add    %r8,%rcx
1206320Sbholler	add    %r8,%rdx
1216320Sbholler
1226320Sbholler	movslq (%r11,%r8,4),%r10
1236320Sbholler	lea    (%r10,%r11,1),%r11
1246320Sbholler	jmpq   *%r11
1256320Sbholler
1266320Sbholler	.balign 16
1276320SbhollerL(ShrtAlignNew):
1286320Sbholler	lea    L(AliPxQx)(%rip),%r11
1296320Sbholler	mov    %rcx,%r9
1306320Sbholler	and    $0xf,%r9
1316320Sbholler
1326320Sbholler	movslq (%r11,%r9,4),%r10
1336320Sbholler	lea    (%r10,%r11,1),%r11
1346320Sbholler	jmpq   *%r11
1356320Sbholler
1366320Sbholler	.balign 16
1376320SbhollerL(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
1386320Sbholler           .int        L(P1Q0)-L(fwdPxQx)
1396320Sbholler           .int        L(P2Q0)-L(fwdPxQx)
1406320Sbholler           .int        L(P3Q0)-L(fwdPxQx)
1416320Sbholler           .int        L(P4Q0)-L(fwdPxQx)
1426320Sbholler           .int        L(P5Q0)-L(fwdPxQx)
1436320Sbholler           .int        L(P6Q0)-L(fwdPxQx)
1446320Sbholler           .int        L(P7Q0)-L(fwdPxQx)
1456320Sbholler
1466320Sbholler           .int        L(P0Q1)-L(fwdPxQx)
1476320Sbholler           .int        L(P1Q1)-L(fwdPxQx)
1486320Sbholler           .int        L(P2Q1)-L(fwdPxQx)
1496320Sbholler           .int        L(P3Q1)-L(fwdPxQx)
1506320Sbholler           .int        L(P4Q1)-L(fwdPxQx)
1516320Sbholler           .int        L(P5Q1)-L(fwdPxQx)
1526320Sbholler           .int        L(P6Q1)-L(fwdPxQx)
1536320Sbholler           .int        L(P7Q1)-L(fwdPxQx)
1546320Sbholler
1556320Sbholler           .int        L(P0Q2)-L(fwdPxQx)
1566320Sbholler           .int        L(P1Q2)-L(fwdPxQx)
1576320Sbholler           .int        L(P2Q2)-L(fwdPxQx)
1586320Sbholler           .int        L(P3Q2)-L(fwdPxQx)
1596320Sbholler           .int        L(P4Q2)-L(fwdPxQx)
1606320Sbholler           .int        L(P5Q2)-L(fwdPxQx)
1616320Sbholler           .int        L(P6Q2)-L(fwdPxQx)
1626320Sbholler           .int        L(P7Q2)-L(fwdPxQx)
1636320Sbholler
1646320Sbholler           .int        L(P0Q3)-L(fwdPxQx)
1656320Sbholler           .int        L(P1Q3)-L(fwdPxQx)
1666320Sbholler           .int        L(P2Q3)-L(fwdPxQx)
1676320Sbholler           .int        L(P3Q3)-L(fwdPxQx)
1686320Sbholler           .int        L(P4Q3)-L(fwdPxQx)
1696320Sbholler           .int        L(P5Q3)-L(fwdPxQx)
1706320Sbholler           .int        L(P6Q3)-L(fwdPxQx)
1716320Sbholler           .int        L(P7Q3)-L(fwdPxQx)
1726320Sbholler
1736320Sbholler           .int        L(P0Q4)-L(fwdPxQx)
1746320Sbholler           .int        L(P1Q4)-L(fwdPxQx)
1756320Sbholler           .int        L(P2Q4)-L(fwdPxQx)
1766320Sbholler           .int        L(P3Q4)-L(fwdPxQx)
1776320Sbholler           .int        L(P4Q4)-L(fwdPxQx)
1786320Sbholler           .int        L(P5Q4)-L(fwdPxQx)
1796320Sbholler           .int        L(P6Q4)-L(fwdPxQx)
1806320Sbholler           .int        L(P7Q4)-L(fwdPxQx)
1816320Sbholler
1826320Sbholler           .int        L(P0Q5)-L(fwdPxQx)
1836320Sbholler           .int        L(P1Q5)-L(fwdPxQx)
1846320Sbholler           .int        L(P2Q5)-L(fwdPxQx)
1856320Sbholler           .int        L(P3Q5)-L(fwdPxQx)
1866320Sbholler           .int        L(P4Q5)-L(fwdPxQx)
1876320Sbholler           .int        L(P5Q5)-L(fwdPxQx)
1886320Sbholler           .int        L(P6Q5)-L(fwdPxQx)
1896320Sbholler           .int        L(P7Q5)-L(fwdPxQx)
1906320Sbholler
1916320Sbholler           .int        L(P0Q6)-L(fwdPxQx)
1926320Sbholler           .int        L(P1Q6)-L(fwdPxQx)
1936320Sbholler           .int        L(P2Q6)-L(fwdPxQx)
1946320Sbholler           .int        L(P3Q6)-L(fwdPxQx)
1956320Sbholler           .int        L(P4Q6)-L(fwdPxQx)
1966320Sbholler           .int        L(P5Q6)-L(fwdPxQx)
1976320Sbholler           .int        L(P6Q6)-L(fwdPxQx)
1986320Sbholler           .int        L(P7Q6)-L(fwdPxQx)
1996320Sbholler
2006320Sbholler           .int        L(P0Q7)-L(fwdPxQx)
2016320Sbholler           .int        L(P1Q7)-L(fwdPxQx)
2026320Sbholler           .int        L(P2Q7)-L(fwdPxQx)
2036320Sbholler           .int        L(P3Q7)-L(fwdPxQx)
2046320Sbholler           .int        L(P4Q7)-L(fwdPxQx)
2056320Sbholler           .int        L(P5Q7)-L(fwdPxQx)
2066320Sbholler           .int        L(P6Q7)-L(fwdPxQx)
2076320Sbholler           .int        L(P7Q7)-L(fwdPxQx)
2086320Sbholler
2096320Sbholler           .int        L(P0Q8)-L(fwdPxQx)
2106320Sbholler           .int        L(P1Q8)-L(fwdPxQx)
2116320Sbholler           .int        L(P2Q8)-L(fwdPxQx)
2126320Sbholler           .int        L(P3Q8)-L(fwdPxQx)
2136320Sbholler           .int        L(P4Q8)-L(fwdPxQx)
2146320Sbholler           .int        L(P5Q8)-L(fwdPxQx)
2156320Sbholler           .int        L(P6Q8)-L(fwdPxQx)
2166320Sbholler           .int        L(P7Q8)-L(fwdPxQx)
2176320Sbholler
2186320Sbholler           .int        L(P0Q9)-L(fwdPxQx)
2196320Sbholler           .int        L(P1Q9)-L(fwdPxQx)
2206320Sbholler           .int        L(P2Q9)-L(fwdPxQx)
2216320Sbholler           .int        L(P3Q9)-L(fwdPxQx)
2226320Sbholler           .int        L(P4Q9)-L(fwdPxQx)
2236320Sbholler           .int        L(P5Q9)-L(fwdPxQx)
2246320Sbholler           .int        L(P6Q9)-L(fwdPxQx)
2256320Sbholler           .int        L(P7Q9)-L(fwdPxQx)
2266320Sbholler
2276320Sbholler           .int        L(P0QA)-L(fwdPxQx)
2286320Sbholler           .int        L(P1QA)-L(fwdPxQx)
2296320Sbholler           .int        L(P2QA)-L(fwdPxQx)
2306320Sbholler           .int        L(P3QA)-L(fwdPxQx)
2316320Sbholler           .int        L(P4QA)-L(fwdPxQx)
2326320Sbholler           .int        L(P5QA)-L(fwdPxQx)
2336320Sbholler           .int        L(P6QA)-L(fwdPxQx)
2346320Sbholler           .int        L(P7QA)-L(fwdPxQx)
2356320Sbholler
2366320Sbholler           .int        L(P0QB)-L(fwdPxQx)
2376320Sbholler           .int        L(P1QB)-L(fwdPxQx)
2386320Sbholler           .int        L(P2QB)-L(fwdPxQx)
2396320Sbholler           .int        L(P3QB)-L(fwdPxQx)
2406320Sbholler           .int        L(P4QB)-L(fwdPxQx)
2416320Sbholler           .int        L(P5QB)-L(fwdPxQx)
2426320Sbholler           .int        L(P6QB)-L(fwdPxQx)
2436320Sbholler           .int        L(P7QB)-L(fwdPxQx)
2446320Sbholler
2456320Sbholler           .int        L(P0QC)-L(fwdPxQx)
2466320Sbholler           .int        L(P1QC)-L(fwdPxQx)
2476320Sbholler           .int        L(P2QC)-L(fwdPxQx)
2486320Sbholler           .int        L(P3QC)-L(fwdPxQx)
2496320Sbholler           .int        L(P4QC)-L(fwdPxQx)
2506320Sbholler           .int        L(P5QC)-L(fwdPxQx)
2516320Sbholler           .int        L(P6QC)-L(fwdPxQx)
2526320Sbholler           .int        L(P7QC)-L(fwdPxQx)
2536320Sbholler
2546320Sbholler           .int        L(P0QD)-L(fwdPxQx)
2556320Sbholler           .int        L(P1QD)-L(fwdPxQx)
2566320Sbholler           .int        L(P2QD)-L(fwdPxQx)
2576320Sbholler           .int        L(P3QD)-L(fwdPxQx)
2586320Sbholler           .int        L(P4QD)-L(fwdPxQx)
2596320Sbholler           .int        L(P5QD)-L(fwdPxQx)
2606320Sbholler           .int        L(P6QD)-L(fwdPxQx)
2616320Sbholler           .int        L(P7QD)-L(fwdPxQx)
2626320Sbholler
2636320Sbholler           .int        L(P0QE)-L(fwdPxQx)
2646320Sbholler           .int        L(P1QE)-L(fwdPxQx)
2656320Sbholler           .int        L(P2QE)-L(fwdPxQx)
2666320Sbholler           .int        L(P3QE)-L(fwdPxQx)
2676320Sbholler           .int        L(P4QE)-L(fwdPxQx)
2686320Sbholler           .int        L(P5QE)-L(fwdPxQx)
2696320Sbholler           .int        L(P6QE)-L(fwdPxQx)
2706320Sbholler           .int        L(P7QE)-L(fwdPxQx)
2716320Sbholler
2726320Sbholler           .int        L(P0QF)-L(fwdPxQx)
2736320Sbholler           .int        L(P1QF)-L(fwdPxQx)
2746320Sbholler           .int        L(P2QF)-L(fwdPxQx)
2756320Sbholler           .int        L(P3QF)-L(fwdPxQx)
2766320Sbholler           .int        L(P4QF)-L(fwdPxQx)
2776320Sbholler           .int        L(P5QF)-L(fwdPxQx)
2786320Sbholler           .int        L(P6QF)-L(fwdPxQx)
2796320Sbholler           .int        L(P7QF)-L(fwdPxQx)
2806320Sbholler
2816320Sbholler           .int        L(P0QG)-L(fwdPxQx)	# 0x80
2826320Sbholler
2836320Sbholler	   .balign 16
2846320SbhollerL(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
2856320Sbholler           .int        L(A1Q0)-L(AliPxQx)
2866320Sbholler           .int        L(A2Q0)-L(AliPxQx)
2876320Sbholler           .int        L(A3Q0)-L(AliPxQx)
2886320Sbholler           .int        L(A4Q0)-L(AliPxQx)
2896320Sbholler           .int        L(A5Q0)-L(AliPxQx)
2906320Sbholler           .int        L(A6Q0)-L(AliPxQx)
2916320Sbholler           .int        L(A7Q0)-L(AliPxQx)
2926320Sbholler           .int        L(A0Q1)-L(AliPxQx)
2936320Sbholler           .int        L(A1Q1)-L(AliPxQx)
2946320Sbholler           .int        L(A2Q1)-L(AliPxQx)
2956320Sbholler           .int        L(A3Q1)-L(AliPxQx)
2966320Sbholler           .int        L(A4Q1)-L(AliPxQx)
2976320Sbholler           .int        L(A5Q1)-L(AliPxQx)
2986320Sbholler           .int        L(A6Q1)-L(AliPxQx)
2996320Sbholler           .int        L(A7Q1)-L(AliPxQx)
3006320Sbholler
3016320Sbholler	.balign 16
3026320SbhollerL(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
3036320Sbholler	movzbq (%rdx),%r11
3046320Sbholler	sub    $0xf,%r8
3056320Sbholler	mov    %r11b,(%rcx)
3066320Sbholler
3076320Sbholler	movzwq 0x1(%rdx),%r10
3086320Sbholler	mov    %r10w,0x1(%rcx)
3096320Sbholler
3106320Sbholler	mov    0x3(%rdx),%r9d
3116320Sbholler	mov    %r9d,0x3(%rcx)
3126320Sbholler
3136320Sbholler	mov    0x7(%rdx),%r11
3146320Sbholler	add    $0xf,%rdx
3156320Sbholler	mov    %r11,0x7(%rcx)
3166320Sbholler
3176320Sbholler	add    $0xf,%rcx
3186320Sbholler	jmp    L(now_qw_aligned)
3196320Sbholler
3206320Sbholler	.balign 16
3216320SbhollerL(A2Q0):			# ; need to move 8+ 6=2+4 bytes
3226320Sbholler	movzwq (%rdx),%r10
3236320Sbholler	sub    $0xe,%r8
3246320Sbholler	mov    %r10w,(%rcx)
3256320Sbholler
3266320Sbholler	mov    0x2(%rdx),%r9d
3276320Sbholler	mov    %r9d,0x2(%rcx)
3286320Sbholler
3296320Sbholler	mov    0x6(%rdx),%r11
3306320Sbholler	add    $0xe,%rdx
3316320Sbholler	mov    %r11,0x6(%rcx)
3326320Sbholler	add    $0xe,%rcx
3336320Sbholler	jmp    L(now_qw_aligned)
3346320Sbholler
3356320Sbholler	.balign 16
3366320SbhollerL(A3Q0):			# ; need to move 8+ 5=1+4 bytes
3376320Sbholler	movzbq (%rdx),%r11
3386320Sbholler	sub    $0xd,%r8
3396320Sbholler	mov    %r11b,(%rcx)
3406320Sbholler
3416320Sbholler	mov    0x1(%rdx),%r9d
3426320Sbholler	mov    %r9d,0x1(%rcx)
3436320Sbholler
3446320Sbholler	mov    0x5(%rdx),%r10
3456320Sbholler	add    $0xd,%rdx
3466320Sbholler	mov    %r10,0x5(%rcx)
3476320Sbholler
3486320Sbholler	add    $0xd,%rcx
3496320Sbholler	jmp    L(now_qw_aligned)
3506320Sbholler
3516320Sbholler	.balign 16
3526320SbhollerL(A4Q0):			# ; need to move 8+4 bytes
3536320Sbholler	mov    (%rdx),%r9d
3546320Sbholler	sub    $0xc,%r8
3556320Sbholler	mov    %r9d,(%rcx)
3566320Sbholler
3576320Sbholler	mov    0x4(%rdx),%r10
3586320Sbholler	add    $0xc,%rdx
3596320Sbholler	mov    %r10,0x4(%rcx)
3606320Sbholler
3616320Sbholler	add    $0xc,%rcx
3626320Sbholler	jmp    L(now_qw_aligned)
3636320Sbholler
3646320Sbholler	.balign 16
3656320SbhollerL(A5Q0):			# ; need to move 8+ 3=1+2 bytes
3666320Sbholler	movzbq (%rdx),%r11
3676320Sbholler	sub    $0xb,%r8
3686320Sbholler	mov    %r11b,(%rcx)
3696320Sbholler
3706320Sbholler	movzwq 0x1(%rdx),%r10
3716320Sbholler	mov    %r10w,0x1(%rcx)
3726320Sbholler
3736320Sbholler	mov    0x3(%rdx),%r9
3746320Sbholler	add    $0xb,%rdx
3756320Sbholler	mov    %r9,0x3(%rcx)
3766320Sbholler
3776320Sbholler	add    $0xb,%rcx
3786320Sbholler	jmp    L(now_qw_aligned)
3796320Sbholler
3806320Sbholler	.balign 16
3816320SbhollerL(A6Q0):			# ; need to move 8+2 bytes
3826320Sbholler	movzwq (%rdx),%r10
3836320Sbholler	sub    $0xa,%r8
3846320Sbholler	mov    %r10w,(%rcx)
3856320Sbholler
3866320Sbholler	mov    0x2(%rdx),%r9
3876320Sbholler	add    $0xa,%rdx
3886320Sbholler	mov    %r9,0x2(%rcx)
3896320Sbholler
3906320Sbholler	add    $0xa,%rcx
3916320Sbholler	jmp    L(now_qw_aligned)
3926320Sbholler
3936320Sbholler	.balign 16
3946320SbhollerL(A7Q0):			# ; need to move 8+1 byte
3956320Sbholler	movzbq (%rdx),%r11
3966320Sbholler	sub    $0x9,%r8
3976320Sbholler	mov    %r11b,(%rcx)
3986320Sbholler
3996320Sbholler	mov    0x1(%rdx),%r10
4006320Sbholler	add    $0x9,%rdx
4016320Sbholler	mov    %r10,0x1(%rcx)
4026320Sbholler
4036320Sbholler	add    $0x9,%rcx
4046320Sbholler	jmp    L(now_qw_aligned)
4056320Sbholler
4066320Sbholler	.balign 16
4076320SbhollerL(A0Q1):			# ; need to move 8 bytes
4086320Sbholler
4096320Sbholler	mov    (%rdx),%r10
4106320Sbholler	add    $0x8,%rdx
4116320Sbholler	sub    $0x8,%r8
4126320Sbholler	mov    %r10,(%rcx)
4136320Sbholler
4146320Sbholler	add    $0x8,%rcx
4156320Sbholler	jmp    L(now_qw_aligned)
4166320Sbholler
4176320Sbholler	.balign 16
4186320SbhollerL(A1Q1):			# ; need to move 7=1+2+4 bytes
4196320Sbholler	movzbq (%rdx),%r11
4206320Sbholler	sub    $0x7,%r8
4216320Sbholler	mov    %r11b,(%rcx)
4226320Sbholler
4236320Sbholler	movzwq 0x1(%rdx),%r10
4246320Sbholler	mov    %r10w,0x1(%rcx)
4256320Sbholler
4266320Sbholler	mov    0x3(%rdx),%r9d
4276320Sbholler	add    $0x7,%rdx
4286320Sbholler	mov    %r9d,0x3(%rcx)
4296320Sbholler	add    $0x7,%rcx
4306320Sbholler	jmp    L(now_qw_aligned)
4316320Sbholler
4326320Sbholler	.balign 16
4336320SbhollerL(A2Q1):			# ; need to move 6=2+4 bytes
4346320Sbholler	movzwq (%rdx),%r10
4356320Sbholler	sub    $0x6,%r8
4366320Sbholler	mov    %r10w,(%rcx)
4376320Sbholler	mov    0x2(%rdx),%r9d
4386320Sbholler	add    $0x6,%rdx
4396320Sbholler	mov    %r9d,0x2(%rcx)
4406320Sbholler	add    $0x6,%rcx
4416320Sbholler	jmp    L(now_qw_aligned)
4426320Sbholler
4436320Sbholler	.balign 16
4446320SbhollerL(A3Q1):			# ; need to move 5=1+4 bytes
4456320Sbholler	movzbq (%rdx),%r11
4466320Sbholler	sub    $0x5,%r8
4476320Sbholler	mov    %r11b,(%rcx)
4486320Sbholler	mov    0x1(%rdx),%r9d
4496320Sbholler	add    $0x5,%rdx
4506320Sbholler	mov    %r9d,0x1(%rcx)
4516320Sbholler	add    $0x5,%rcx
4526320Sbholler	jmp    L(now_qw_aligned)
4536320Sbholler
4546320Sbholler	.balign 16
4556320SbhollerL(A4Q1):			# ; need to move 4 bytes
4566320Sbholler	mov    (%rdx),%r9d
4576320Sbholler	sub    $0x4,%r8
4586320Sbholler	add    $0x4,%rdx
4596320Sbholler	mov    %r9d,(%rcx)
4606320Sbholler	add    $0x4,%rcx
4616320Sbholler	jmp    L(now_qw_aligned)
4626320Sbholler
4636320Sbholler	.balign 16
4646320SbhollerL(A5Q1):			# ; need to move 3=1+2 bytes
4656320Sbholler	movzbq (%rdx),%r11
4666320Sbholler	sub    $0x3,%r8
4676320Sbholler	mov    %r11b,(%rcx)
4686320Sbholler
4696320Sbholler	movzwq 0x1(%rdx),%r10
4706320Sbholler	add    $0x3,%rdx
4716320Sbholler	mov    %r10w,0x1(%rcx)
4726320Sbholler
4736320Sbholler	add    $0x3,%rcx
4746320Sbholler	jmp    L(now_qw_aligned)
4756320Sbholler
4766320Sbholler	.balign 16
4776320SbhollerL(A6Q1):			# ; need to move 2 bytes
4786320Sbholler	movzwq (%rdx),%r10
4796320Sbholler	sub    $0x2,%r8
4806320Sbholler	add    $0x2,%rdx
4816320Sbholler	mov    %r10w,(%rcx)
4826320Sbholler	add    $0x2,%rcx
4836320Sbholler	jmp    L(now_qw_aligned)
4846320Sbholler
4856320Sbholler	.balign 16
4866320SbhollerL(A7Q1):			# ; need to move 1 byte
4876320Sbholler	movzbq (%rdx),%r11
4886320Sbholler	dec    %r8
4896320Sbholler	inc    %rdx
4906320Sbholler	mov    %r11b,(%rcx)
4916320Sbholler	inc    %rcx
4926320Sbholler	jmp    L(now_qw_aligned)
4936320Sbholler
4946320Sbholler
4956320Sbholler	.balign 16
4966320SbhollerL(P0QG):
4976320Sbholler	mov    -0x80(%rdx),%r9
4986320Sbholler	mov    %r9,-0x80(%rcx)
4996320SbhollerL(P0QF):
5006320Sbholler	mov    -0x78(%rdx),%r10
5016320Sbholler	mov    %r10,-0x78(%rcx)
5026320SbhollerL(P0QE):
5036320Sbholler	mov    -0x70(%rdx),%r9
5046320Sbholler	mov    %r9,-0x70(%rcx)
5056320SbhollerL(P0QD):
5066320Sbholler	mov    -0x68(%rdx),%r10
5076320Sbholler	mov    %r10,-0x68(%rcx)
5086320SbhollerL(P0QC):
5096320Sbholler	mov    -0x60(%rdx),%r9
5106320Sbholler	mov    %r9,-0x60(%rcx)
5116320SbhollerL(P0QB):
5126320Sbholler	mov    -0x58(%rdx),%r10
5136320Sbholler	mov    %r10,-0x58(%rcx)
5146320SbhollerL(P0QA):
5156320Sbholler	mov    -0x50(%rdx),%r9
5166320Sbholler	mov    %r9,-0x50(%rcx)
5176320SbhollerL(P0Q9):
5186320Sbholler	mov    -0x48(%rdx),%r10
5196320Sbholler	mov    %r10,-0x48(%rcx)
5206320SbhollerL(P0Q8):
5216320Sbholler	mov    -0x40(%rdx),%r9
5226320Sbholler	mov    %r9,-0x40(%rcx)
5236320SbhollerL(P0Q7):
5246320Sbholler	mov    -0x38(%rdx),%r10
5256320Sbholler	mov    %r10,-0x38(%rcx)
5266320SbhollerL(P0Q6):
5276320Sbholler	mov    -0x30(%rdx),%r9
5286320Sbholler	mov    %r9,-0x30(%rcx)
5296320SbhollerL(P0Q5):
5306320Sbholler	mov    -0x28(%rdx),%r10
5316320Sbholler	mov    %r10,-0x28(%rcx)
5326320SbhollerL(P0Q4):
5336320Sbholler	mov    -0x20(%rdx),%r9
5346320Sbholler	mov    %r9,-0x20(%rcx)
5356320SbhollerL(P0Q3):
5366320Sbholler	mov    -0x18(%rdx),%r10
5376320Sbholler	mov    %r10,-0x18(%rcx)
5386320SbhollerL(P0Q2):
5396320Sbholler	mov    -0x10(%rdx),%r9
5406320Sbholler	mov    %r9,-0x10(%rcx)
5416320SbhollerL(P0Q1):
5426320Sbholler	mov    -0x8(%rdx),%r10
5436320Sbholler	mov    %r10,-0x8(%rcx)
5446320SbhollerL(P0Q0):
5456320Sbholler	ret
5466320Sbholler
5476320Sbholler	.balign 16
5486320SbhollerL(P1QF):
5496320Sbholler	mov    -0x79(%rdx),%r9
5506320Sbholler	mov    %r9,-0x79(%rcx)
5516320SbhollerL(P1QE):
5526320Sbholler	mov    -0x71(%rdx),%r11
5536320Sbholler	mov    %r11,-0x71(%rcx)
5546320SbhollerL(P1QD):
5556320Sbholler	mov    -0x69(%rdx),%r10
5566320Sbholler	mov    %r10,-0x69(%rcx)
5576320SbhollerL(P1QC):
5586320Sbholler	mov    -0x61(%rdx),%r9
5596320Sbholler	mov    %r9,-0x61(%rcx)
5606320SbhollerL(P1QB):
5616320Sbholler	mov    -0x59(%rdx),%r11
5626320Sbholler	mov    %r11,-0x59(%rcx)
5636320SbhollerL(P1QA):
5646320Sbholler	mov    -0x51(%rdx),%r10
5656320Sbholler	mov    %r10,-0x51(%rcx)
5666320SbhollerL(P1Q9):
5676320Sbholler	mov    -0x49(%rdx),%r9
5686320Sbholler	mov    %r9,-0x49(%rcx)
5696320SbhollerL(P1Q8):
5706320Sbholler	mov    -0x41(%rdx),%r11
5716320Sbholler	mov    %r11,-0x41(%rcx)
5726320SbhollerL(P1Q7):
5736320Sbholler	mov    -0x39(%rdx),%r10
5746320Sbholler	mov    %r10,-0x39(%rcx)
5756320SbhollerL(P1Q6):
5766320Sbholler	mov    -0x31(%rdx),%r9
5776320Sbholler	mov    %r9,-0x31(%rcx)
5786320SbhollerL(P1Q5):
5796320Sbholler	mov    -0x29(%rdx),%r11
5806320Sbholler	mov    %r11,-0x29(%rcx)
5816320SbhollerL(P1Q4):
5826320Sbholler	mov    -0x21(%rdx),%r10
5836320Sbholler	mov    %r10,-0x21(%rcx)
5846320SbhollerL(P1Q3):
5856320Sbholler	mov    -0x19(%rdx),%r9
5866320Sbholler	mov    %r9,-0x19(%rcx)
5876320SbhollerL(P1Q2):
5886320Sbholler	mov    -0x11(%rdx),%r11
5896320Sbholler	mov    %r11,-0x11(%rcx)
5906320SbhollerL(P1Q1):
5916320Sbholler	mov    -0x9(%rdx),%r10
5926320Sbholler	mov    %r10,-0x9(%rcx)
5936320SbhollerL(P1Q0):
5946320Sbholler	movzbq -0x1(%rdx),%r9
5956320Sbholler	mov    %r9b,-0x1(%rcx)
5966320Sbholler	ret
5976320Sbholler
5986320Sbholler	.balign 16
5996320SbhollerL(P2QF):
6006320Sbholler	mov    -0x7a(%rdx),%r9
6016320Sbholler	mov    %r9,-0x7a(%rcx)
6026320SbhollerL(P2QE):
6036320Sbholler	mov    -0x72(%rdx),%r11
6046320Sbholler	mov    %r11,-0x72(%rcx)
6056320SbhollerL(P2QD):
6066320Sbholler	mov    -0x6a(%rdx),%r10
6076320Sbholler	mov    %r10,-0x6a(%rcx)
6086320SbhollerL(P2QC):
6096320Sbholler	mov    -0x62(%rdx),%r9
6106320Sbholler	mov    %r9,-0x62(%rcx)
6116320SbhollerL(P2QB):
6126320Sbholler	mov    -0x5a(%rdx),%r11
6136320Sbholler	mov    %r11,-0x5a(%rcx)
6146320SbhollerL(P2QA):
6156320Sbholler	mov    -0x52(%rdx),%r10
6166320Sbholler	mov    %r10,-0x52(%rcx)
6176320SbhollerL(P2Q9):
6186320Sbholler	mov    -0x4a(%rdx),%r9
6196320Sbholler	mov    %r9,-0x4a(%rcx)
6206320SbhollerL(P2Q8):
6216320Sbholler	mov    -0x42(%rdx),%r11
6226320Sbholler	mov    %r11,-0x42(%rcx)
6236320SbhollerL(P2Q7):
6246320Sbholler	mov    -0x3a(%rdx),%r10
6256320Sbholler	mov    %r10,-0x3a(%rcx)
6266320SbhollerL(P2Q6):
6276320Sbholler	mov    -0x32(%rdx),%r9
6286320Sbholler	mov    %r9,-0x32(%rcx)
6296320SbhollerL(P2Q5):
6306320Sbholler	mov    -0x2a(%rdx),%r11
6316320Sbholler	mov    %r11,-0x2a(%rcx)
6326320SbhollerL(P2Q4):
6336320Sbholler	mov    -0x22(%rdx),%r10
6346320Sbholler	mov    %r10,-0x22(%rcx)
6356320SbhollerL(P2Q3):
6366320Sbholler	mov    -0x1a(%rdx),%r9
6376320Sbholler	mov    %r9,-0x1a(%rcx)
6386320SbhollerL(P2Q2):
6396320Sbholler	mov    -0x12(%rdx),%r11
6406320Sbholler	mov    %r11,-0x12(%rcx)
6416320SbhollerL(P2Q1):
6426320Sbholler	mov    -0xa(%rdx),%r10
6436320Sbholler	mov    %r10,-0xa(%rcx)
6446320SbhollerL(P2Q0):
6456320Sbholler	movzwq -0x2(%rdx),%r9
6466320Sbholler	mov    %r9w,-0x2(%rcx)
6476320Sbholler	ret
6486320Sbholler
6496320Sbholler	.balign 16
6506320SbhollerL(P3QF):
6516320Sbholler	mov    -0x7b(%rdx),%r9
6526320Sbholler	mov    %r9,-0x7b(%rcx)
6536320SbhollerL(P3QE):
6546320Sbholler	mov    -0x73(%rdx),%r11
6556320Sbholler	mov    %r11,-0x73(%rcx)
6566320SbhollerL(P3QD):
6576320Sbholler	mov    -0x6b(%rdx),%r10
6586320Sbholler	mov    %r10,-0x6b(%rcx)
6596320SbhollerL(P3QC):
6606320Sbholler	mov    -0x63(%rdx),%r9
6616320Sbholler	mov    %r9,-0x63(%rcx)
6626320SbhollerL(P3QB):
6636320Sbholler	mov    -0x5b(%rdx),%r11
6646320Sbholler	mov    %r11,-0x5b(%rcx)
6656320SbhollerL(P3QA):
6666320Sbholler	mov    -0x53(%rdx),%r10
6676320Sbholler	mov    %r10,-0x53(%rcx)
6686320SbhollerL(P3Q9):
6696320Sbholler	mov    -0x4b(%rdx),%r9
6706320Sbholler	mov    %r9,-0x4b(%rcx)
6716320SbhollerL(P3Q8):
6726320Sbholler	mov    -0x43(%rdx),%r11
6736320Sbholler	mov    %r11,-0x43(%rcx)
6746320SbhollerL(P3Q7):
6756320Sbholler	mov    -0x3b(%rdx),%r10
6766320Sbholler	mov    %r10,-0x3b(%rcx)
6776320SbhollerL(P3Q6):
6786320Sbholler	mov    -0x33(%rdx),%r9
6796320Sbholler	mov    %r9,-0x33(%rcx)
6806320SbhollerL(P3Q5):
6816320Sbholler	mov    -0x2b(%rdx),%r11
6826320Sbholler	mov    %r11,-0x2b(%rcx)
6836320SbhollerL(P3Q4):
6846320Sbholler	mov    -0x23(%rdx),%r10
6856320Sbholler	mov    %r10,-0x23(%rcx)
6866320SbhollerL(P3Q3):
6876320Sbholler	mov    -0x1b(%rdx),%r9
6886320Sbholler	mov    %r9,-0x1b(%rcx)
6896320SbhollerL(P3Q2):
6906320Sbholler	mov    -0x13(%rdx),%r11
6916320Sbholler	mov    %r11,-0x13(%rcx)
6926320SbhollerL(P3Q1):
6936320Sbholler	mov    -0xb(%rdx),%r10
6946320Sbholler	mov    %r10,-0xb(%rcx)
6956320Sbholler	/*
6966320Sbholler	 * These trailing loads/stores have to do all their loads 1st,
6976320Sbholler	 * then do the stores.
6986320Sbholler	 */
6996320SbhollerL(P3Q0):
7006320Sbholler	movzwq -0x3(%rdx),%r9
7016320Sbholler	movzbq -0x1(%rdx),%r10
7026320Sbholler	mov    %r9w,-0x3(%rcx)
7036320Sbholler	mov    %r10b,-0x1(%rcx)
7046320Sbholler	ret
7056320Sbholler
7066320Sbholler	.balign 16
7076320SbhollerL(P4QF):
7086320Sbholler	mov    -0x7c(%rdx),%r9
7096320Sbholler	mov    %r9,-0x7c(%rcx)
7106320SbhollerL(P4QE):
7116320Sbholler	mov    -0x74(%rdx),%r11
7126320Sbholler	mov    %r11,-0x74(%rcx)
7136320SbhollerL(P4QD):
7146320Sbholler	mov    -0x6c(%rdx),%r10
7156320Sbholler	mov    %r10,-0x6c(%rcx)
7166320SbhollerL(P4QC):
7176320Sbholler	mov    -0x64(%rdx),%r9
7186320Sbholler	mov    %r9,-0x64(%rcx)
7196320SbhollerL(P4QB):
7206320Sbholler	mov    -0x5c(%rdx),%r11
7216320Sbholler	mov    %r11,-0x5c(%rcx)
7226320SbhollerL(P4QA):
7236320Sbholler	mov    -0x54(%rdx),%r10
7246320Sbholler	mov    %r10,-0x54(%rcx)
7256320SbhollerL(P4Q9):
7266320Sbholler	mov    -0x4c(%rdx),%r9
7276320Sbholler	mov    %r9,-0x4c(%rcx)
7286320SbhollerL(P4Q8):
7296320Sbholler	mov    -0x44(%rdx),%r11
7306320Sbholler	mov    %r11,-0x44(%rcx)
7316320SbhollerL(P4Q7):
7326320Sbholler	mov    -0x3c(%rdx),%r10
7336320Sbholler	mov    %r10,-0x3c(%rcx)
7346320SbhollerL(P4Q6):
7356320Sbholler	mov    -0x34(%rdx),%r9
7366320Sbholler	mov    %r9,-0x34(%rcx)
7376320SbhollerL(P4Q5):
7386320Sbholler	mov    -0x2c(%rdx),%r11
7396320Sbholler	mov    %r11,-0x2c(%rcx)
7406320SbhollerL(P4Q4):
7416320Sbholler	mov    -0x24(%rdx),%r10
7426320Sbholler	mov    %r10,-0x24(%rcx)
7436320SbhollerL(P4Q3):
7446320Sbholler	mov    -0x1c(%rdx),%r9
7456320Sbholler	mov    %r9,-0x1c(%rcx)
7466320SbhollerL(P4Q2):
7476320Sbholler	mov    -0x14(%rdx),%r11
7486320Sbholler	mov    %r11,-0x14(%rcx)
7496320SbhollerL(P4Q1):
7506320Sbholler	mov    -0xc(%rdx),%r10
7516320Sbholler	mov    %r10,-0xc(%rcx)
7526320SbhollerL(P4Q0):
7536320Sbholler	mov    -0x4(%rdx),%r9d
7546320Sbholler	mov    %r9d,-0x4(%rcx)
7556320Sbholler	ret
7566320Sbholler
7576320Sbholler	.balign 16
7586320SbhollerL(P5QF):
7596320Sbholler	mov    -0x7d(%rdx),%r9
7606320Sbholler	mov    %r9,-0x7d(%rcx)
7616320SbhollerL(P5QE):
7626320Sbholler	mov    -0x75(%rdx),%r11
7636320Sbholler	mov    %r11,-0x75(%rcx)
7646320SbhollerL(P5QD):
7656320Sbholler	mov    -0x6d(%rdx),%r10
7666320Sbholler	mov    %r10,-0x6d(%rcx)
7676320SbhollerL(P5QC):
7686320Sbholler	mov    -0x65(%rdx),%r9
7696320Sbholler	mov    %r9,-0x65(%rcx)
7706320SbhollerL(P5QB):
7716320Sbholler	mov    -0x5d(%rdx),%r11
7726320Sbholler	mov    %r11,-0x5d(%rcx)
7736320SbhollerL(P5QA):
7746320Sbholler	mov    -0x55(%rdx),%r10
7756320Sbholler	mov    %r10,-0x55(%rcx)
7766320SbhollerL(P5Q9):
7776320Sbholler	mov    -0x4d(%rdx),%r9
7786320Sbholler	mov    %r9,-0x4d(%rcx)
7796320SbhollerL(P5Q8):
7806320Sbholler	mov    -0x45(%rdx),%r11
7816320Sbholler	mov    %r11,-0x45(%rcx)
7826320SbhollerL(P5Q7):
7836320Sbholler	mov    -0x3d(%rdx),%r10
7846320Sbholler	mov    %r10,-0x3d(%rcx)
7856320SbhollerL(P5Q6):
7866320Sbholler	mov    -0x35(%rdx),%r9
7876320Sbholler	mov    %r9,-0x35(%rcx)
7886320SbhollerL(P5Q5):
7896320Sbholler	mov    -0x2d(%rdx),%r11
7906320Sbholler	mov    %r11,-0x2d(%rcx)
7916320SbhollerL(P5Q4):
7926320Sbholler	mov    -0x25(%rdx),%r10
7936320Sbholler	mov    %r10,-0x25(%rcx)
7946320SbhollerL(P5Q3):
7956320Sbholler	mov    -0x1d(%rdx),%r9
7966320Sbholler	mov    %r9,-0x1d(%rcx)
7976320SbhollerL(P5Q2):
7986320Sbholler	mov    -0x15(%rdx),%r11
7996320Sbholler	mov    %r11,-0x15(%rcx)
8006320SbhollerL(P5Q1):
8016320Sbholler	mov    -0xd(%rdx),%r10
8026320Sbholler	mov    %r10,-0xd(%rcx)
8036320Sbholler	/*
8046320Sbholler	 * These trailing loads/stores have to do all their loads 1st,
8056320Sbholler	 * then do the stores.
8066320Sbholler	 */
8076320SbhollerL(P5Q0):
8086320Sbholler	mov    -0x5(%rdx),%r9d
8096320Sbholler	movzbq -0x1(%rdx),%r10
8106320Sbholler	mov    %r9d,-0x5(%rcx)
8116320Sbholler	mov    %r10b,-0x1(%rcx)
8126320Sbholler	ret
8136320Sbholler
8146320Sbholler	.balign 16
/*
 * L(P6Qm): copy the trailing (8 * m + 6) bytes.  Same end-relative scheme
 * as L(P5Qm): %rdx/%rcx point one past the end, offsets are negative
 * (e.g. -0x7e = -(15*8 + 6)), and execution falls through to L(P6Q0).
 */
8156320SbhollerL(P6QF):
8166320Sbholler	mov    -0x7e(%rdx),%r9
8176320Sbholler	mov    %r9,-0x7e(%rcx)
8186320SbhollerL(P6QE):
8196320Sbholler	mov    -0x76(%rdx),%r11
8206320Sbholler	mov    %r11,-0x76(%rcx)
8216320SbhollerL(P6QD):
8226320Sbholler	mov    -0x6e(%rdx),%r10
8236320Sbholler	mov    %r10,-0x6e(%rcx)
8246320SbhollerL(P6QC):
8256320Sbholler	mov    -0x66(%rdx),%r9
8266320Sbholler	mov    %r9,-0x66(%rcx)
8276320SbhollerL(P6QB):
8286320Sbholler	mov    -0x5e(%rdx),%r11
8296320Sbholler	mov    %r11,-0x5e(%rcx)
8306320SbhollerL(P6QA):
8316320Sbholler	mov    -0x56(%rdx),%r10
8326320Sbholler	mov    %r10,-0x56(%rcx)
8336320SbhollerL(P6Q9):
8346320Sbholler	mov    -0x4e(%rdx),%r9
8356320Sbholler	mov    %r9,-0x4e(%rcx)
8366320SbhollerL(P6Q8):
8376320Sbholler	mov    -0x46(%rdx),%r11
8386320Sbholler	mov    %r11,-0x46(%rcx)
8396320SbhollerL(P6Q7):
8406320Sbholler	mov    -0x3e(%rdx),%r10
8416320Sbholler	mov    %r10,-0x3e(%rcx)
8426320SbhollerL(P6Q6):
8436320Sbholler	mov    -0x36(%rdx),%r9
8446320Sbholler	mov    %r9,-0x36(%rcx)
8456320SbhollerL(P6Q5):
8466320Sbholler	mov    -0x2e(%rdx),%r11
8476320Sbholler	mov    %r11,-0x2e(%rcx)
8486320SbhollerL(P6Q4):
8496320Sbholler	mov    -0x26(%rdx),%r10
8506320Sbholler	mov    %r10,-0x26(%rcx)
8516320SbhollerL(P6Q3):
8526320Sbholler	mov    -0x1e(%rdx),%r9
8536320Sbholler	mov    %r9,-0x1e(%rcx)
8546320SbhollerL(P6Q2):
8556320Sbholler	mov    -0x16(%rdx),%r11
8566320Sbholler	mov    %r11,-0x16(%rcx)
8576320SbhollerL(P6Q1):
8586320Sbholler	mov    -0xe(%rdx),%r10
8596320Sbholler	mov    %r10,-0xe(%rcx)
8606320Sbholler	/*
8616320Sbholler	 * These trailing loads/stores have to do all their loads 1st,
8626320Sbholler	 * then do the stores.
8636320Sbholler	 */
8646320SbhollerL(P6Q0):
	/* last 6 bytes = one 4-byte move plus one 2-byte move */
8656320Sbholler	mov    -0x6(%rdx),%r9d
8666320Sbholler	movzwq -0x2(%rdx),%r10
8676320Sbholler	mov    %r9d,-0x6(%rcx)
8686320Sbholler	mov    %r10w,-0x2(%rcx)
8696320Sbholler	ret
8706320Sbholler
8716320Sbholler	.balign 16
/*
 * L(P7Qm): copy the trailing (8 * m + 7) bytes.  Same end-relative scheme
 * as L(P5Qm)/L(P6Qm): %rdx/%rcx point one past the end, offsets are
 * negative (e.g. -0x7f = -(15*8 + 7)), falling through to L(P7Q0).
 */
8726320SbhollerL(P7QF):
8736320Sbholler	mov    -0x7f(%rdx),%r9
8746320Sbholler	mov    %r9,-0x7f(%rcx)
8756320SbhollerL(P7QE):
8766320Sbholler	mov    -0x77(%rdx),%r11
8776320Sbholler	mov    %r11,-0x77(%rcx)
8786320SbhollerL(P7QD):
8796320Sbholler	mov    -0x6f(%rdx),%r10
8806320Sbholler	mov    %r10,-0x6f(%rcx)
8816320SbhollerL(P7QC):
8826320Sbholler	mov    -0x67(%rdx),%r9
8836320Sbholler	mov    %r9,-0x67(%rcx)
8846320SbhollerL(P7QB):
8856320Sbholler	mov    -0x5f(%rdx),%r11
8866320Sbholler	mov    %r11,-0x5f(%rcx)
8876320SbhollerL(P7QA):
8886320Sbholler	mov    -0x57(%rdx),%r10
8896320Sbholler	mov    %r10,-0x57(%rcx)
8906320SbhollerL(P7Q9):
8916320Sbholler	mov    -0x4f(%rdx),%r9
8926320Sbholler	mov    %r9,-0x4f(%rcx)
8936320SbhollerL(P7Q8):
8946320Sbholler	mov    -0x47(%rdx),%r11
8956320Sbholler	mov    %r11,-0x47(%rcx)
8966320SbhollerL(P7Q7):
8976320Sbholler	mov    -0x3f(%rdx),%r10
8986320Sbholler	mov    %r10,-0x3f(%rcx)
8996320SbhollerL(P7Q6):
9006320Sbholler	mov    -0x37(%rdx),%r9
9016320Sbholler	mov    %r9,-0x37(%rcx)
9026320SbhollerL(P7Q5):
9036320Sbholler	mov    -0x2f(%rdx),%r11
9046320Sbholler	mov    %r11,-0x2f(%rcx)
9056320SbhollerL(P7Q4):
9066320Sbholler	mov    -0x27(%rdx),%r10
9076320Sbholler	mov    %r10,-0x27(%rcx)
9086320SbhollerL(P7Q3):
9096320Sbholler	mov    -0x1f(%rdx),%r9
9106320Sbholler	mov    %r9,-0x1f(%rcx)
9116320SbhollerL(P7Q2):
9126320Sbholler	mov    -0x17(%rdx),%r11
9136320Sbholler	mov    %r11,-0x17(%rcx)
9146320SbhollerL(P7Q1):
9156320Sbholler	mov    -0xf(%rdx),%r10
9166320Sbholler	mov    %r10,-0xf(%rcx)
9176320Sbholler	/*
9186320Sbholler	 * These trailing loads/stores have to do all their loads 1st,
9196320Sbholler	 * then do the stores.
9206320Sbholler	 */
9216320SbhollerL(P7Q0):
	/* last 7 bytes = 4-byte + 2-byte + 1-byte moves */
9226320Sbholler	mov    -0x7(%rdx),%r9d
9236320Sbholler	movzwq -0x3(%rdx),%r10
9246320Sbholler	movzbq -0x1(%rdx),%r11
9256320Sbholler	mov    %r9d,-0x7(%rcx)
9266320Sbholler	mov    %r10w,-0x3(%rcx)
9276320Sbholler	mov    %r11b,-0x1(%rcx)
9286320Sbholler	ret
9296320Sbholler
9306320Sbholler	.balign 16
/*
 * Medium/large-copy dispatcher.  Register roles here: %rdx = src,
 * %rcx = dst, %r8 = remaining byte count.
 */
9316320SbhollerL(ck_use_sse2):
9326320Sbholler	/*
9336320Sbholler	 * Align dest to 16 byte boundary.
9346320Sbholler	 */
9356320Sbholler	test   $0xf,%rcx
9366320Sbholler	jnz    L(ShrtAlignNew)		/* dst not 16-aligned: go fix it up */
9376320Sbholler
9386320SbhollerL(now_qw_aligned):
	/* no usable SSE on this CPU: fall back to the 8-byte integer loop */
9396320Sbholler	cmpl   $NO_SSE,.memops_method(%rip)
9406320Sbholler	je     L(Loop8byte_pre)
9416320Sbholler
9426320Sbholler	/*
9436320Sbholler	 * The fall-through path is to do SSE2 16-byte load/stores
9446320Sbholler	 */
9456320Sbholler
9466320Sbholler	/*
9476320Sbholler	 * If current move size is larger than half of the highest level cache
9486320Sbholler	 * size, then do non-temporal moves.
9496320Sbholler	 */
9506320Sbholler	mov    .largest_level_cache_size(%rip),%r9d
9516320Sbholler	shr    %r9		# take half of it
9526320Sbholler	cmp    %r9,%r8
9536320Sbholler	jg     L(sse2_nt_move)
9546320Sbholler
9556320Sbholler	/*
9566320Sbholler	 * If both the source and dest are aligned, then use the both aligned
9576320Sbholler	 * logic. Well aligned data should reap the rewards.
9586320Sbholler	 */
9596320Sbholler	test   $0xf,%rdx
9606320Sbholler	jz     L(pre_both_aligned)
9616320Sbholler
	/* src is misaligned: pick the per-alignment handler table */
9626320Sbholler	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
9636320Sbholler	testl  $USE_SSSE3,.memops_method(%rip)
9646320Sbholler	jz     1f
9656320Sbholler	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
9666320Sbholler
9676320Sbholler1:
9686320Sbholler	/*
9696320Sbholler	 * if the src is not 16 byte aligned...
9706320Sbholler	 */
	/*
	 * %r11 = src & 0xf (the misalignment).  Copy the first 16 bytes with
	 * an unaligned load / aligned store, then round src DOWN to a 16-byte
	 * boundary (add 0x10, subtract %r11) and preload %xmm1 with the
	 * aligned chunk; the per-alignment loops below shift/merge from there.
	 */
9716320Sbholler	mov    %rdx,%r11
9726320Sbholler	and    $0xf,%r11
9736320Sbholler	movdqu (%rdx),%xmm0
9746320Sbholler	movdqa %xmm0,(%rcx)
9756320Sbholler	add    $0x10,%rdx
9766320Sbholler	sub    %r11,%rdx
9776320Sbholler	add    $0x10,%rcx
9786320Sbholler	sub    $0x10,%r8
9796320Sbholler	movdqa (%rdx),%xmm1
9806320Sbholler
	/* table lookup: entry %r11 is a 32-bit offset from the table base */
9816320Sbholler	movslq (%r10,%r11,4),%r9
9826320Sbholler	lea    (%r9,%r10,1),%r10
9836320Sbholler	jmpq   *%r10
9846320Sbholler
9856320Sbholler	    .balign 16
/*
 * Handler tables, indexed by src misalignment (src & 0xf).  Each entry is
 * a signed 32-bit offset of the handler relative to the table base; the
 * dispatch code above loads it with movslq, adds the base, and jumps.
 * Entry 0 (src already aligned) goes to L(pre_both_aligned).  Note the
 * SSSE3 table also uses the SSE2 L(movdqa8) (shufpd) for the 8-byte case.
 */
9866320SbhollerL(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
9876320Sbholler	    .int        L(mov3dqa1) -L(SSSE3_src)
9886320Sbholler	    .int        L(mov3dqa2) -L(SSSE3_src)
9896320Sbholler	    .int        L(mov3dqa3) -L(SSSE3_src)
9906320Sbholler	    .int        L(mov3dqa4) -L(SSSE3_src)
9916320Sbholler	    .int        L(mov3dqa5) -L(SSSE3_src)
9926320Sbholler	    .int        L(mov3dqa6) -L(SSSE3_src)
9936320Sbholler	    .int        L(mov3dqa7) -L(SSSE3_src)
9946320Sbholler	    .int        L(movdqa8)  -L(SSSE3_src)
9956320Sbholler	    .int        L(mov3dqa9) -L(SSSE3_src)
9966320Sbholler	    .int        L(mov3dqa10)-L(SSSE3_src)
9976320Sbholler	    .int        L(mov3dqa11)-L(SSSE3_src)
9986320Sbholler	    .int        L(mov3dqa12)-L(SSSE3_src)
9996320Sbholler	    .int        L(mov3dqa13)-L(SSSE3_src)
10006320Sbholler	    .int        L(mov3dqa14)-L(SSSE3_src)
10016320Sbholler	    .int        L(mov3dqa15)-L(SSSE3_src)
10026320SbhollerL(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)
10036320Sbholler	    .int        L(movdqa1) -L(SSE_src)
10046320Sbholler	    .int        L(movdqa2) -L(SSE_src)
10056320Sbholler	    .int        L(movdqa3) -L(SSE_src)
10066320Sbholler	    .int        L(movdqa4) -L(SSE_src)
10076320Sbholler	    .int        L(movdqa5) -L(SSE_src)
10086320Sbholler	    .int        L(movdqa6) -L(SSE_src)
10096320Sbholler	    .int        L(movdqa7) -L(SSE_src)
10106320Sbholler	    .int        L(movdqa8) -L(SSE_src)
10116320Sbholler	    .int        L(movdqa9) -L(SSE_src)
10126320Sbholler	    .int        L(movdqa10)-L(SSE_src)
10136320Sbholler	    .int        L(movdqa11)-L(SSE_src)
10146320Sbholler	    .int        L(movdqa12)-L(SSE_src)
10156320Sbholler	    .int        L(movdqa13)-L(SSE_src)
10166320Sbholler	    .int        L(movdqa14)-L(SSE_src)
10176320Sbholler	    .int        L(movdqa15)-L(SSE_src)
10186320Sbholler
10196320Sbholler	.balign 16
/*
 * SSE2 copy loop for src misaligned by 1 byte.  %rdx has been rounded
 * down to a 16-byte boundary and %xmm1 preloaded with the first aligned
 * chunk.  Each iteration rebuilds two 16-byte output blocks as
 * (prev >> 1 byte) | (cur << 15 bytes) using psrldq/pslldq/por, copying
 * 0x20 bytes while at least 0x20 remain in %r8.
 */
10206320SbhollerL(movdqa1):
10216320Sbholler	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
10226320Sbholler	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
10236320Sbholler	lea    0x20(%rdx),%rdx
10246320Sbholler	lea    -0x20(%r8),%r8
10256320Sbholler
10266320Sbholler	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
10276320Sbholler	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
10286320Sbholler	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
10296320Sbholler	por    %xmm1,%xmm3 # OR them together
10306320Sbholler	cmp    $0x20,%r8
10316320Sbholler
10326320Sbholler	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
10336320Sbholler	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
10346320Sbholler	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
10356320Sbholler	por    %xmm2,%xmm0 # OR them together
10366320Sbholler	movdqa %xmm3,(%rcx)     # store it
10376320Sbholler	movdqa %xmm0,0x10(%rcx) # store it
10386320Sbholler	lea    0x20(%rcx),%rcx
10396320Sbholler
10406320Sbholler	jge    L(movdqa1)
10416320Sbholler	jmp    L(movdqa_epi)
10426320Sbholler
10436320Sbholler	.balign 16
/*
 * L(movdqaN), N = 2..7: SSE2 copy loops for src misaligned by N bytes.
 * Same structure as L(movdqa1): each iteration merges adjacent aligned
 * 16-byte chunks with psrldq $N / pslldq $(16-N) / por and stores two
 * 16-byte blocks (0x20 bytes), looping while %r8 >= 0x20 after the
 * subtraction, then finishing in L(movdqa_epi).
 */
10446320SbhollerL(movdqa2):
10456320Sbholler	sub    $0x20,%r8
10466320Sbholler	movdqa 0x10(%rdx),%xmm3
10476320Sbholler	movdqa 0x20(%rdx),%xmm0
10486320Sbholler	add    $0x20,%rdx
10496320Sbholler
10506320Sbholler	psrldq $0x2,%xmm1
10516320Sbholler	movdqa %xmm3,%xmm2
10526320Sbholler	pslldq $0xe,%xmm3
10536320Sbholler	por    %xmm1,%xmm3
10546320Sbholler
10556320Sbholler	psrldq $0x2,%xmm2
10566320Sbholler	movdqa %xmm0,%xmm1
10576320Sbholler	pslldq $0xe,%xmm0
10586320Sbholler	por    %xmm2,%xmm0
10596320Sbholler	movdqa %xmm3,(%rcx)
10606320Sbholler	movdqa %xmm0,0x10(%rcx)
10616320Sbholler
10626320Sbholler	add    $0x20,%rcx
10636320Sbholler	cmp    $0x20,%r8
10646320Sbholler	jge    L(movdqa2)
10656320Sbholler	jmp    L(movdqa_epi)
10666320Sbholler
10676320Sbholler	.balign 16
10686320SbhollerL(movdqa3):
10696320Sbholler	sub    $0x20,%r8
10706320Sbholler	movdqa 0x10(%rdx),%xmm3
10716320Sbholler	movdqa 0x20(%rdx),%xmm0
10726320Sbholler	add    $0x20,%rdx
10736320Sbholler
10746320Sbholler	psrldq $0x3,%xmm1
10756320Sbholler	movdqa %xmm3,%xmm2
10766320Sbholler	pslldq $0xd,%xmm3
10776320Sbholler	por    %xmm1,%xmm3
10786320Sbholler
10796320Sbholler	psrldq $0x3,%xmm2
10806320Sbholler	movdqa %xmm0,%xmm1
10816320Sbholler	pslldq $0xd,%xmm0
10826320Sbholler	por    %xmm2,%xmm0
10836320Sbholler	movdqa %xmm3,(%rcx)
10846320Sbholler	movdqa %xmm0,0x10(%rcx)
10856320Sbholler
10866320Sbholler	add    $0x20,%rcx
10876320Sbholler	cmp    $0x20,%r8
10886320Sbholler	jge    L(movdqa3)
10896320Sbholler	jmp    L(movdqa_epi)
10906320Sbholler
10916320Sbholler	.balign 16
10926320SbhollerL(movdqa4):
10936320Sbholler	sub    $0x20,%r8
10946320Sbholler	movdqa 0x10(%rdx),%xmm3
10956320Sbholler	movdqa 0x20(%rdx),%xmm0
10966320Sbholler	add    $0x20,%rdx
10976320Sbholler
10986320Sbholler	psrldq $0x4,%xmm1
10996320Sbholler	movdqa %xmm3,%xmm2
11006320Sbholler	pslldq $0xc,%xmm3
11016320Sbholler	por    %xmm1,%xmm3
11026320Sbholler
11036320Sbholler	psrldq $0x4,%xmm2
11046320Sbholler	movdqa %xmm0,%xmm1
11056320Sbholler	pslldq $0xc,%xmm0
11066320Sbholler	por    %xmm2,%xmm0
11076320Sbholler
11086320Sbholler	movdqa %xmm3,(%rcx)
11096320Sbholler	movdqa %xmm0,0x10(%rcx)
11106320Sbholler
11116320Sbholler	add    $0x20,%rcx
11126320Sbholler	cmp    $0x20,%r8
11136320Sbholler	jge    L(movdqa4)
11146320Sbholler	jmp    L(movdqa_epi)
11156320Sbholler
11166320Sbholler	.balign 16
11176320SbhollerL(movdqa5):
11186320Sbholler	sub    $0x20,%r8
11196320Sbholler	movdqa 0x10(%rdx),%xmm3
11206320Sbholler	movdqa 0x20(%rdx),%xmm0
11216320Sbholler	add    $0x20,%rdx
11226320Sbholler
11236320Sbholler	psrldq $0x5,%xmm1
11246320Sbholler	movdqa %xmm3,%xmm2
11256320Sbholler	pslldq $0xb,%xmm3
11266320Sbholler	por    %xmm1,%xmm3
11276320Sbholler
11286320Sbholler	psrldq $0x5,%xmm2
11296320Sbholler	movdqa %xmm0,%xmm1
11306320Sbholler	pslldq $0xb,%xmm0
11316320Sbholler	por    %xmm2,%xmm0
11326320Sbholler
11336320Sbholler	movdqa %xmm3,(%rcx)
11346320Sbholler	movdqa %xmm0,0x10(%rcx)
11356320Sbholler
11366320Sbholler	add    $0x20,%rcx
11376320Sbholler	cmp    $0x20,%r8
11386320Sbholler	jge    L(movdqa5)
11396320Sbholler	jmp    L(movdqa_epi)
11406320Sbholler
11416320Sbholler	.balign 16
11426320SbhollerL(movdqa6):
11436320Sbholler	sub    $0x20,%r8
11446320Sbholler	movdqa 0x10(%rdx),%xmm3
11456320Sbholler	movdqa 0x20(%rdx),%xmm0
11466320Sbholler	add    $0x20,%rdx
11476320Sbholler
11486320Sbholler	psrldq $0x6,%xmm1
11496320Sbholler	movdqa %xmm3,%xmm2
11506320Sbholler	pslldq $0xa,%xmm3
11516320Sbholler	por    %xmm1,%xmm3
11526320Sbholler
11536320Sbholler	psrldq $0x6,%xmm2
11546320Sbholler	movdqa %xmm0,%xmm1
11556320Sbholler	pslldq $0xa,%xmm0
11566320Sbholler	por    %xmm2,%xmm0
11576320Sbholler	movdqa %xmm3,(%rcx)
11586320Sbholler	movdqa %xmm0,0x10(%rcx)
11596320Sbholler
11606320Sbholler	add    $0x20,%rcx
11616320Sbholler	cmp    $0x20,%r8
11626320Sbholler	jge    L(movdqa6)
11636320Sbholler	jmp    L(movdqa_epi)
11646320Sbholler
11656320Sbholler	.balign 16
11666320SbhollerL(movdqa7):
11676320Sbholler	sub    $0x20,%r8
11686320Sbholler	movdqa 0x10(%rdx),%xmm3
11696320Sbholler	movdqa 0x20(%rdx),%xmm0
11706320Sbholler	add    $0x20,%rdx
11716320Sbholler
11726320Sbholler	psrldq $0x7,%xmm1
11736320Sbholler	movdqa %xmm3,%xmm2
11746320Sbholler	pslldq $0x9,%xmm3
11756320Sbholler	por    %xmm1,%xmm3
11766320Sbholler
11776320Sbholler	psrldq $0x7,%xmm2
11786320Sbholler	movdqa %xmm0,%xmm1
11796320Sbholler	pslldq $0x9,%xmm0
11806320Sbholler	por    %xmm2,%xmm0
11816320Sbholler	movdqa %xmm3,(%rcx)
11826320Sbholler	movdqa %xmm0,0x10(%rcx)
11836320Sbholler
11846320Sbholler	add    $0x20,%rcx
11856320Sbholler	cmp    $0x20,%r8
11866320Sbholler	jge    L(movdqa7)
11876320Sbholler	jmp    L(movdqa_epi)
11886320Sbholler
11896320Sbholler	.balign 16
/*
 * src misaligned by exactly 8 bytes: instead of byte shifts, shufpd $1
 * concatenates the high quadword of the previous chunk with the low
 * quadword of the next.  This variant moves 0x30 bytes per iteration
 * (three 16-byte stores) and is shared by the SSE2 and SSSE3 tables.
 */
11906320SbhollerL(movdqa8):
11916320Sbholler	movdqa 0x10(%rdx),%xmm3
11926320Sbholler	sub    $0x30,%r8
11936320Sbholler	movdqa 0x20(%rdx),%xmm0
11946320Sbholler	movdqa 0x30(%rdx),%xmm5
11956320Sbholler	lea    0x30(%rdx),%rdx
11966320Sbholler
	/* xmm1 = { high qword of prev, low qword of xmm3 } */
11976320Sbholler	shufpd $0x1,%xmm3,%xmm1
11986320Sbholler	movdqa %xmm1,(%rcx)
11996320Sbholler
12006320Sbholler	cmp    $0x30,%r8
12016320Sbholler
12026320Sbholler	shufpd $0x1,%xmm0,%xmm3
12036320Sbholler	movdqa %xmm3,0x10(%rcx)
12046320Sbholler
	/* save xmm5 as the "prev" chunk for the next iteration */
12056320Sbholler	movdqa %xmm5,%xmm1
12066320Sbholler	shufpd $0x1,%xmm5,%xmm0
12076320Sbholler	movdqa %xmm0,0x20(%rcx)
12086320Sbholler
12096320Sbholler	lea    0x30(%rcx),%rcx
12106320Sbholler
12116320Sbholler	jge    L(movdqa8)
12126320Sbholler	jmp    L(movdqa_epi)
12136320Sbholler
12146320Sbholler	.balign 16
/*
 * L(movdqaN), N = 9..15: SSE2 copy loops for src misaligned by N bytes.
 * Identical structure to L(movdqa2)..L(movdqa7), merging chunks with
 * psrldq $N / pslldq $(16-N) / por, 0x20 bytes per iteration.  The last
 * one, L(movdqa15), falls straight through into L(movdqa_epi).
 */
12156320SbhollerL(movdqa9):
12166320Sbholler	sub    $0x20,%r8
12176320Sbholler	movdqa 0x10(%rdx),%xmm3
12186320Sbholler	movdqa 0x20(%rdx),%xmm0
12196320Sbholler	add    $0x20,%rdx
12206320Sbholler
12216320Sbholler	psrldq $0x9,%xmm1
12226320Sbholler	movdqa %xmm3,%xmm2
12236320Sbholler	pslldq $0x7,%xmm3
12246320Sbholler	por    %xmm1,%xmm3
12256320Sbholler
12266320Sbholler	psrldq $0x9,%xmm2
12276320Sbholler	movdqa %xmm0,%xmm1
12286320Sbholler	pslldq $0x7,%xmm0
12296320Sbholler	por    %xmm2,%xmm0
12306320Sbholler	movdqa %xmm3,(%rcx)
12316320Sbholler	movdqa %xmm0,0x10(%rcx)
12326320Sbholler
12336320Sbholler	add    $0x20,%rcx
12346320Sbholler	cmp    $0x20,%r8
12356320Sbholler	jge    L(movdqa9)
12366320Sbholler	jmp    L(movdqa_epi)
12376320Sbholler
12386320Sbholler	.balign 16
12396320SbhollerL(movdqa10):
12406320Sbholler	sub    $0x20,%r8
12416320Sbholler	movdqa 0x10(%rdx),%xmm3
12426320Sbholler	movdqa 0x20(%rdx),%xmm0
12436320Sbholler	add    $0x20,%rdx
12446320Sbholler
12456320Sbholler	psrldq $0xa,%xmm1
12466320Sbholler	movdqa %xmm3,%xmm2
12476320Sbholler	pslldq $0x6,%xmm3
12486320Sbholler	por    %xmm1,%xmm3
12496320Sbholler
12506320Sbholler	psrldq $0xa,%xmm2
12516320Sbholler	movdqa %xmm0,%xmm1
12526320Sbholler	pslldq $0x6,%xmm0
12536320Sbholler	por    %xmm2,%xmm0
12546320Sbholler	movdqa %xmm3,(%rcx)
12556320Sbholler	movdqa %xmm0,0x10(%rcx)
12566320Sbholler
12576320Sbholler	add    $0x20,%rcx
12586320Sbholler	cmp    $0x20,%r8
12596320Sbholler	jge    L(movdqa10)
12606320Sbholler	jmp    L(movdqa_epi)
12616320Sbholler
12626320Sbholler	.balign 16
12636320SbhollerL(movdqa11):
12646320Sbholler	sub    $0x20,%r8
12656320Sbholler	movdqa 0x10(%rdx),%xmm3
12666320Sbholler	movdqa 0x20(%rdx),%xmm0
12676320Sbholler	add    $0x20,%rdx
12686320Sbholler
12696320Sbholler	psrldq $0xb,%xmm1
12706320Sbholler	movdqa %xmm3,%xmm2
12716320Sbholler	pslldq $0x5,%xmm3
12726320Sbholler	por    %xmm1,%xmm3
12736320Sbholler
12746320Sbholler	psrldq $0xb,%xmm2
12756320Sbholler	movdqa %xmm0,%xmm1
12766320Sbholler	pslldq $0x5,%xmm0
12776320Sbholler	por    %xmm2,%xmm0
12786320Sbholler	movdqa %xmm3,(%rcx)
12796320Sbholler	movdqa %xmm0,0x10(%rcx)
12806320Sbholler
12816320Sbholler	add    $0x20,%rcx
12826320Sbholler	cmp    $0x20,%r8
12836320Sbholler	jge    L(movdqa11)
12846320Sbholler	jmp    L(movdqa_epi)
12856320Sbholler
12866320Sbholler	.balign 16
12876320SbhollerL(movdqa12):
12886320Sbholler	sub    $0x20,%r8
12896320Sbholler	movdqa 0x10(%rdx),%xmm3
12906320Sbholler	movdqa 0x20(%rdx),%xmm0
12916320Sbholler	add    $0x20,%rdx
12926320Sbholler
12936320Sbholler	psrldq $0xc,%xmm1
12946320Sbholler	movdqa %xmm3,%xmm2
12956320Sbholler	pslldq $0x4,%xmm3
12966320Sbholler	por    %xmm1,%xmm3
12976320Sbholler
12986320Sbholler	psrldq $0xc,%xmm2
12996320Sbholler	movdqa %xmm0,%xmm1
13006320Sbholler	pslldq $0x4,%xmm0
13016320Sbholler	por    %xmm2,%xmm0
13026320Sbholler	movdqa %xmm3,(%rcx)
13036320Sbholler	movdqa %xmm0,0x10(%rcx)
13046320Sbholler
13056320Sbholler	add    $0x20,%rcx
13066320Sbholler	cmp    $0x20,%r8
13076320Sbholler	jge    L(movdqa12)
13086320Sbholler	jmp    L(movdqa_epi)
13096320Sbholler
13106320Sbholler	.balign 16
13116320SbhollerL(movdqa13):
13126320Sbholler	sub    $0x20,%r8
13136320Sbholler	movdqa 0x10(%rdx),%xmm3
13146320Sbholler	movdqa 0x20(%rdx),%xmm0
13156320Sbholler	add    $0x20,%rdx
13166320Sbholler
13176320Sbholler	psrldq $0xd,%xmm1
13186320Sbholler	movdqa %xmm3,%xmm2
13196320Sbholler	pslldq $0x3,%xmm3
13206320Sbholler	por    %xmm1,%xmm3
13216320Sbholler
13226320Sbholler	psrldq $0xd,%xmm2
13236320Sbholler	movdqa %xmm0,%xmm1
13246320Sbholler	pslldq $0x3,%xmm0
13256320Sbholler	por    %xmm2,%xmm0
13266320Sbholler	movdqa %xmm3,(%rcx)
13276320Sbholler	movdqa %xmm0,0x10(%rcx)
13286320Sbholler
13296320Sbholler	add    $0x20,%rcx
13306320Sbholler	cmp    $0x20,%r8
13316320Sbholler	jge    L(movdqa13)
13326320Sbholler	jmp    L(movdqa_epi)
13336320Sbholler
13346320Sbholler	.balign 16
13356320SbhollerL(movdqa14):
13366320Sbholler	sub    $0x20,%r8
13376320Sbholler	movdqa 0x10(%rdx),%xmm3
13386320Sbholler	movdqa 0x20(%rdx),%xmm0
13396320Sbholler	add    $0x20,%rdx
13406320Sbholler
13416320Sbholler	psrldq $0xe,%xmm1
13426320Sbholler	movdqa %xmm3,%xmm2
13436320Sbholler	pslldq $0x2,%xmm3
13446320Sbholler	por    %xmm1,%xmm3
13456320Sbholler
13466320Sbholler	psrldq $0xe,%xmm2
13476320Sbholler	movdqa %xmm0,%xmm1
13486320Sbholler	pslldq $0x2,%xmm0
13496320Sbholler	por    %xmm2,%xmm0
13506320Sbholler	movdqa %xmm3,(%rcx)
13516320Sbholler	movdqa %xmm0,0x10(%rcx)
13526320Sbholler
13536320Sbholler	add    $0x20,%rcx
13546320Sbholler	cmp    $0x20,%r8
13556320Sbholler	jge    L(movdqa14)
13566320Sbholler	jmp    L(movdqa_epi)
13576320Sbholler
13586320Sbholler	.balign 16
13596320SbhollerL(movdqa15):
13606320Sbholler	sub    $0x20,%r8
13616320Sbholler	movdqa 0x10(%rdx),%xmm3
13626320Sbholler	movdqa 0x20(%rdx),%xmm0
13636320Sbholler	add    $0x20,%rdx
13646320Sbholler
13656320Sbholler	psrldq $0xf,%xmm1
13666320Sbholler	movdqa %xmm3,%xmm2
13676320Sbholler	pslldq $0x1,%xmm3
13686320Sbholler	por    %xmm1,%xmm3
13696320Sbholler
13706320Sbholler	psrldq $0xf,%xmm2
13716320Sbholler	movdqa %xmm0,%xmm1
13726320Sbholler	pslldq $0x1,%xmm0
13736320Sbholler	por    %xmm2,%xmm0
13746320Sbholler	movdqa %xmm3,(%rcx)
13756320Sbholler	movdqa %xmm0,0x10(%rcx)
13766320Sbholler
13776320Sbholler	add    $0x20,%rcx
13786320Sbholler	cmp    $0x20,%r8
13796320Sbholler	jge    L(movdqa15)
	/* falls through to L(movdqa_epi) below */
13806320Sbholler	#jmp   L(movdqa_epi)
13816320Sbholler
13826320Sbholler	.balign 16
/*
 * Common epilogue for the misaligned-src loops.  %r8 holds the remaining
 * partial count.  %rdx is first corrected by %r11 (the loops ran with src
 * rounded down by the misalignment), then both pointers are advanced by
 * %r8 so they point one past the end, and we dispatch into the fwdPxQx
 * trailing-copy table (end-relative, like L(P5Qm)/L(P6Qm)/L(P7Qm)).
 */
13836320SbhollerL(movdqa_epi):
13846320Sbholler	lea    L(fwdPxQx)(%rip),%r10
13856320Sbholler	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
13866320Sbholler	add    %r8,%rcx
13876320Sbholler	add    %r8,%rdx
13886320Sbholler
	/* table entry %r8 is a 32-bit offset from the table base */
13896320Sbholler	movslq (%r10,%r8,4),%r9
13906320Sbholler	lea    (%r9,%r10,1),%r10
13916320Sbholler	jmpq   *%r10
13926320Sbholler
13936320Sbholler	.balign 16
/*
 * SSSE3 copy loop for src misaligned by 1 byte: palignr concatenates the
 * previous and current aligned 16-byte chunks in one instruction (instead
 * of the psrldq/pslldq/por triple).  Main loop moves 0x30 bytes per
 * iteration, then up to two single 16-byte chunks are handled before
 * finishing in L(movdqa_epi).  The palignr instructions are hand-encoded
 * with .byte (bytes 66 0F 3A 0F = palignr), presumably so the file builds
 * with assemblers that lack SSSE3 support — the intended mnemonic is in
 * the # comment above each encoding.
 */
13946320SbhollerL(mov3dqa1):
13956320Sbholler	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
13966320Sbholler	sub	$0x30,%r8
13976320Sbholler	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
13986320Sbholler	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
13996320Sbholler	lea	0x30(%rdx),%rdx
14006320Sbholler	cmp	$0x30,%r8
14016320Sbholler
14026320Sbholler	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
14036320Sbholler	#palignr	$0x1,%xmm1,%xmm3
14046320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14056320Sbholler	.byte	0xd9,0x01
14066320Sbholler	movdqa	%xmm3,(%rcx)      # store it
14076320Sbholler
14086320Sbholler	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
14096320Sbholler	#palignr	$0x1,%xmm2,%xmm0
14106320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14116320Sbholler	.byte	0xc2,0x01
14126320Sbholler	movdqa	%xmm0,0x10(%rcx)  # store it
14136320Sbholler
14146320Sbholler	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
14156320Sbholler	#palignr	$0x1,%xmm4,%xmm5
14166320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14176320Sbholler	.byte	0xec,0x01
14186320Sbholler	movdqa	%xmm5,0x20(%rcx)  # store it
14196320Sbholler
14206320Sbholler	lea	0x30(%rcx),%rcx
14216320Sbholler	jge	L(mov3dqa1)
14226320Sbholler
	/* fewer than 0x30 bytes left: copy at most two more 16-byte chunks */
14236320Sbholler	cmp	$0x10,%r8
14246320Sbholler	jl	L(movdqa_epi)
14256320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
14266320Sbholler	sub	$0x10,%r8
14276320Sbholler	lea	0x10(%rdx),%rdx
14286320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
14296320Sbholler	#palignr	$0x1,%xmm1,%xmm3
14306320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14316320Sbholler	.byte	0xd9,0x01
14326320Sbholler
14336320Sbholler	cmp	$0x10,%r8
14346320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
14356320Sbholler	lea	0x10(%rcx),%rcx
14366320Sbholler	jl	L(movdqa_epi)
14376320Sbholler
14386320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
14396320Sbholler	sub	$0x10,%r8
14406320Sbholler	lea	0x10(%rdx),%rdx
14416320Sbholler	#palignr	$0x1,%xmm2,%xmm0
14426320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14436320Sbholler	.byte	0xc2,0x01
14446320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
14456320Sbholler	lea	0x10(%rcx),%rcx
14466320Sbholler	jmp	L(movdqa_epi)
14476320Sbholler
14486320Sbholler	.balign 16
/*
 * L(mov3dqaN), N = 2..7: SSSE3 copy loops for src misaligned by N bytes.
 * Identical structure to L(mov3dqa1); only the palignr immediate (the
 * last .byte of each hand-encoded instruction, and the # comment above
 * it) changes to N.  Main loop: 0x30 bytes/iteration; then up to two
 * single 16-byte chunks; finish in L(movdqa_epi).
 */
14496320SbhollerL(mov3dqa2):
14506320Sbholler	movdqa	0x10(%rdx),%xmm3
14516320Sbholler	sub	$0x30,%r8
14526320Sbholler	movdqa	0x20(%rdx),%xmm0
14536320Sbholler	movdqa	0x30(%rdx),%xmm5
14546320Sbholler	lea	0x30(%rdx),%rdx
14556320Sbholler	cmp	$0x30,%r8
14566320Sbholler
14576320Sbholler	movdqa	%xmm3,%xmm2
14586320Sbholler	#palignr	$0x2,%xmm1,%xmm3
14596320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14606320Sbholler	.byte	0xd9,0x02
14616320Sbholler	movdqa	%xmm3,(%rcx)
14626320Sbholler
14636320Sbholler	movdqa	%xmm0,%xmm4
14646320Sbholler	#palignr	$0x2,%xmm2,%xmm0
14656320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14666320Sbholler	.byte	0xc2,0x02
14676320Sbholler	movdqa	%xmm0,0x10(%rcx)
14686320Sbholler
14696320Sbholler	movdqa	%xmm5,%xmm1
14706320Sbholler	#palignr	$0x2,%xmm4,%xmm5
14716320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14726320Sbholler	.byte	0xec,0x02
14736320Sbholler	movdqa	%xmm5,0x20(%rcx)
14746320Sbholler
14756320Sbholler	lea	0x30(%rcx),%rcx
14766320Sbholler	jge	L(mov3dqa2)
14776320Sbholler
14786320Sbholler	cmp	$0x10,%r8
14796320Sbholler	jl	L(movdqa_epi)
14806320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
14816320Sbholler	sub	$0x10,%r8
14826320Sbholler	lea	0x10(%rdx),%rdx
14836320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
14846320Sbholler	#palignr	$0x2,%xmm1,%xmm3
14856320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14866320Sbholler	.byte	0xd9,0x02
14876320Sbholler
14886320Sbholler	cmp	$0x10,%r8
14896320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
14906320Sbholler	lea	0x10(%rcx),%rcx
14916320Sbholler	jl	L(movdqa_epi)
14926320Sbholler
14936320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
14946320Sbholler	sub	$0x10,%r8
14956320Sbholler	lea	0x10(%rdx),%rdx
14966320Sbholler	#palignr	$0x2,%xmm2,%xmm0
14976320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
14986320Sbholler	.byte	0xc2,0x02
14996320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
15006320Sbholler	lea	0x10(%rcx),%rcx
15016320Sbholler	jmp	L(movdqa_epi)
15026320Sbholler
15036320Sbholler	.balign 16
15046320SbhollerL(mov3dqa3):
15056320Sbholler	movdqa	0x10(%rdx),%xmm3
15066320Sbholler	sub	$0x30,%r8
15076320Sbholler	movdqa	0x20(%rdx),%xmm0
15086320Sbholler	movdqa	0x30(%rdx),%xmm5
15096320Sbholler	lea	0x30(%rdx),%rdx
15106320Sbholler	cmp	$0x30,%r8
15116320Sbholler
15126320Sbholler	movdqa	%xmm3,%xmm2
15136320Sbholler	#palignr	$0x3,%xmm1,%xmm3
15146320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15156320Sbholler	.byte	0xd9,0x03
15166320Sbholler	movdqa	%xmm3,(%rcx)
15176320Sbholler
15186320Sbholler	movdqa	%xmm0,%xmm4
15196320Sbholler	#palignr	$0x3,%xmm2,%xmm0
15206320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15216320Sbholler	.byte	0xc2,0x03
15226320Sbholler	movdqa	%xmm0,0x10(%rcx)
15236320Sbholler
15246320Sbholler	movdqa	%xmm5,%xmm1
15256320Sbholler	#palignr	$0x3,%xmm4,%xmm5
15266320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15276320Sbholler	.byte	0xec,0x03
15286320Sbholler	movdqa	%xmm5,0x20(%rcx)
15296320Sbholler
15306320Sbholler	lea	0x30(%rcx),%rcx
15316320Sbholler	jge	L(mov3dqa3)
15326320Sbholler
15336320Sbholler	cmp	$0x10,%r8
15346320Sbholler	jl	L(movdqa_epi)
15356320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
15366320Sbholler	sub	$0x10,%r8
15376320Sbholler	lea	0x10(%rdx),%rdx
15386320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
15396320Sbholler	#palignr	$0x3,%xmm1,%xmm3
15406320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15416320Sbholler	.byte	0xd9,0x03
15426320Sbholler
15436320Sbholler	cmp	$0x10,%r8
15446320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
15456320Sbholler	lea	0x10(%rcx),%rcx
15466320Sbholler	jl	L(movdqa_epi)
15476320Sbholler
15486320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
15496320Sbholler	sub	$0x10,%r8
15506320Sbholler	lea	0x10(%rdx),%rdx
15516320Sbholler	#palignr	$0x3,%xmm2,%xmm0
15526320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15536320Sbholler	.byte	0xc2,0x03
15546320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
15556320Sbholler	lea	0x10(%rcx),%rcx
15566320Sbholler	jmp	L(movdqa_epi)
15576320Sbholler
15586320Sbholler	.balign 16
15596320SbhollerL(mov3dqa4):
15606320Sbholler	movdqa	0x10(%rdx),%xmm3
15616320Sbholler	sub	$0x30,%r8
15626320Sbholler	movdqa	0x20(%rdx),%xmm0
15636320Sbholler	movdqa	0x30(%rdx),%xmm5
15646320Sbholler	lea	0x30(%rdx),%rdx
15656320Sbholler	cmp	$0x30,%r8
15666320Sbholler
15676320Sbholler	movdqa	%xmm3,%xmm2
15686320Sbholler	#palignr	$0x4,%xmm1,%xmm3
15696320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15706320Sbholler	.byte	0xd9,0x04
15716320Sbholler	movdqa	%xmm3,(%rcx)
15726320Sbholler
15736320Sbholler	movdqa	%xmm0,%xmm4
15746320Sbholler	#palignr	$0x4,%xmm2,%xmm0
15756320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15766320Sbholler	.byte	0xc2,0x04
15776320Sbholler	movdqa	%xmm0,0x10(%rcx)
15786320Sbholler
15796320Sbholler	movdqa	%xmm5,%xmm1
15806320Sbholler	#palignr	$0x4,%xmm4,%xmm5
15816320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15826320Sbholler	.byte	0xec,0x04
15836320Sbholler	movdqa	%xmm5,0x20(%rcx)
15846320Sbholler
15856320Sbholler	lea	0x30(%rcx),%rcx
15866320Sbholler	jge	L(mov3dqa4)
15876320Sbholler
15886320Sbholler	cmp	$0x10,%r8
15896320Sbholler	jl	L(movdqa_epi)
15906320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
15916320Sbholler	sub	$0x10,%r8
15926320Sbholler	lea	0x10(%rdx),%rdx
15936320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
15946320Sbholler	#palignr	$0x4,%xmm1,%xmm3
15956320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
15966320Sbholler	.byte	0xd9,0x04
15976320Sbholler
15986320Sbholler	cmp	$0x10,%r8
15996320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
16006320Sbholler	lea	0x10(%rcx),%rcx
16016320Sbholler	jl	L(movdqa_epi)
16026320Sbholler
16036320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
16046320Sbholler	sub	$0x10,%r8
16056320Sbholler	lea	0x10(%rdx),%rdx
16066320Sbholler	#palignr	$0x4,%xmm2,%xmm0
16076320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16086320Sbholler	.byte	0xc2,0x04
16096320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
16106320Sbholler	lea	0x10(%rcx),%rcx
16116320Sbholler	jmp	L(movdqa_epi)
16126320Sbholler
16136320Sbholler	.balign 16
16146320SbhollerL(mov3dqa5):
16156320Sbholler	movdqa	0x10(%rdx),%xmm3
16166320Sbholler	sub	$0x30,%r8
16176320Sbholler	movdqa	0x20(%rdx),%xmm0
16186320Sbholler	movdqa	0x30(%rdx),%xmm5
16196320Sbholler	lea	0x30(%rdx),%rdx
16206320Sbholler	cmp	$0x30,%r8
16216320Sbholler
16226320Sbholler	movdqa	%xmm3,%xmm2
16236320Sbholler	#palignr	$0x5,%xmm1,%xmm3
16246320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16256320Sbholler	.byte	0xd9,0x05
16266320Sbholler	movdqa	%xmm3,(%rcx)
16276320Sbholler
16286320Sbholler	movdqa	%xmm0,%xmm4
16296320Sbholler	#palignr	$0x5,%xmm2,%xmm0
16306320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16316320Sbholler	.byte	0xc2,0x05
16326320Sbholler	movdqa	%xmm0,0x10(%rcx)
16336320Sbholler
16346320Sbholler	movdqa	%xmm5,%xmm1
16356320Sbholler	#palignr	$0x5,%xmm4,%xmm5
16366320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16376320Sbholler	.byte	0xec,0x05
16386320Sbholler	movdqa	%xmm5,0x20(%rcx)
16396320Sbholler
16406320Sbholler	lea	0x30(%rcx),%rcx
16416320Sbholler	jge	L(mov3dqa5)
16426320Sbholler
16436320Sbholler	cmp	$0x10,%r8
16446320Sbholler	jl	L(movdqa_epi)
16456320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
16466320Sbholler	sub	$0x10,%r8
16476320Sbholler	lea	0x10(%rdx),%rdx
16486320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
16496320Sbholler	#palignr	$0x5,%xmm1,%xmm3
16506320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16516320Sbholler	.byte	0xd9,0x05
16526320Sbholler
16536320Sbholler	cmp	$0x10,%r8
16546320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
16556320Sbholler	lea	0x10(%rcx),%rcx
16566320Sbholler	jl	L(movdqa_epi)
16576320Sbholler
16586320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
16596320Sbholler	sub	$0x10,%r8
16606320Sbholler	lea	0x10(%rdx),%rdx
16616320Sbholler	#palignr	$0x5,%xmm2,%xmm0
16626320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16636320Sbholler	.byte	0xc2,0x05
16646320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
16656320Sbholler	lea	0x10(%rcx),%rcx
16666320Sbholler	jmp	L(movdqa_epi)
16676320Sbholler
16686320Sbholler	.balign 16
16696320SbhollerL(mov3dqa6):
16706320Sbholler	movdqa	0x10(%rdx),%xmm3
16716320Sbholler	sub	$0x30,%r8
16726320Sbholler	movdqa	0x20(%rdx),%xmm0
16736320Sbholler	movdqa	0x30(%rdx),%xmm5
16746320Sbholler	lea	0x30(%rdx),%rdx
16756320Sbholler	cmp	$0x30,%r8
16766320Sbholler
16776320Sbholler	movdqa	%xmm3,%xmm2
16786320Sbholler	#palignr	$0x6,%xmm1,%xmm3
16796320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16806320Sbholler	.byte	0xd9,0x06
16816320Sbholler	movdqa	%xmm3,(%rcx)
16826320Sbholler
16836320Sbholler	movdqa	%xmm0,%xmm4
16846320Sbholler	#palignr	$0x6,%xmm2,%xmm0
16856320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16866320Sbholler	.byte	0xc2,0x06
16876320Sbholler	movdqa	%xmm0,0x10(%rcx)
16886320Sbholler
16896320Sbholler	movdqa	%xmm5,%xmm1
16906320Sbholler	#palignr	$0x6,%xmm4,%xmm5
16916320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
16926320Sbholler	.byte	0xec,0x06
16936320Sbholler	movdqa	%xmm5,0x20(%rcx)
16946320Sbholler
16956320Sbholler	lea	0x30(%rcx),%rcx
16966320Sbholler	jge	L(mov3dqa6)
16976320Sbholler
16986320Sbholler	cmp	$0x10,%r8
16996320Sbholler	jl	L(movdqa_epi)
17006320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
17016320Sbholler	sub	$0x10,%r8
17026320Sbholler	lea	0x10(%rdx),%rdx
17036320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
17046320Sbholler	#palignr	$0x6,%xmm1,%xmm3
17056320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17066320Sbholler	.byte	0xd9,0x06
17076320Sbholler
17086320Sbholler	cmp	$0x10,%r8
17096320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
17106320Sbholler	lea	0x10(%rcx),%rcx
17116320Sbholler	jl	L(movdqa_epi)
17126320Sbholler
17136320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
17146320Sbholler	sub	$0x10,%r8
17156320Sbholler	lea	0x10(%rdx),%rdx
17166320Sbholler	#palignr	$0x6,%xmm2,%xmm0
17176320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17186320Sbholler	.byte	0xc2,0x06
17196320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
17206320Sbholler	lea	0x10(%rcx),%rcx
17216320Sbholler	jmp	L(movdqa_epi)
17226320Sbholler
17236320Sbholler	.balign 16
17246320SbhollerL(mov3dqa7):
17256320Sbholler	movdqa	0x10(%rdx),%xmm3
17266320Sbholler	sub	$0x30,%r8
17276320Sbholler	movdqa	0x20(%rdx),%xmm0
17286320Sbholler	movdqa	0x30(%rdx),%xmm5
17296320Sbholler	lea	0x30(%rdx),%rdx
17306320Sbholler	cmp	$0x30,%r8
17316320Sbholler
17326320Sbholler	movdqa	%xmm3,%xmm2
17336320Sbholler	#palignr	$0x7,%xmm1,%xmm3
17346320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17356320Sbholler	.byte	0xd9,0x07
17366320Sbholler	movdqa	%xmm3,(%rcx)
17376320Sbholler
17386320Sbholler	movdqa	%xmm0,%xmm4
17396320Sbholler	#palignr	$0x7,%xmm2,%xmm0
17406320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17416320Sbholler	.byte	0xc2,0x07
17426320Sbholler	movdqa	%xmm0,0x10(%rcx)
17436320Sbholler
17446320Sbholler	movdqa	%xmm5,%xmm1
17456320Sbholler	#palignr	$0x7,%xmm4,%xmm5
17466320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17476320Sbholler	.byte	0xec,0x07
17486320Sbholler	movdqa	%xmm5,0x20(%rcx)
17496320Sbholler
17506320Sbholler	lea	0x30(%rcx),%rcx
17516320Sbholler	jge	L(mov3dqa7)
17526320Sbholler
17536320Sbholler	cmp	$0x10,%r8
17546320Sbholler	jl	L(movdqa_epi)
17556320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
17566320Sbholler	sub	$0x10,%r8
17576320Sbholler	lea	0x10(%rdx),%rdx
17586320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
17596320Sbholler	#palignr	$0x7,%xmm1,%xmm3
17606320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17616320Sbholler	.byte	0xd9,0x07
17626320Sbholler
17636320Sbholler	cmp	$0x10,%r8
17646320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
17656320Sbholler	lea	0x10(%rcx),%rcx
17666320Sbholler	jl	L(movdqa_epi)
17676320Sbholler
17686320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
17696320Sbholler	sub	$0x10,%r8
17706320Sbholler	lea	0x10(%rdx),%rdx
17716320Sbholler	#palignr	$0x7,%xmm2,%xmm0
17726320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17736320Sbholler	.byte	0xc2,0x07
17746320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
17756320Sbholler	lea	0x10(%rcx),%rcx
17766320Sbholler	jmp	L(movdqa_epi)
17776320Sbholler
17786320Sbholler	.balign 16
17796320SbhollerL(mov3dqa9):
17806320Sbholler	movdqa	0x10(%rdx),%xmm3
17816320Sbholler	sub	$0x30,%r8
17826320Sbholler	movdqa	0x20(%rdx),%xmm0
17836320Sbholler	movdqa	0x30(%rdx),%xmm5
17846320Sbholler	lea	0x30(%rdx),%rdx
17856320Sbholler	cmp	$0x30,%r8
17866320Sbholler
17876320Sbholler	movdqa	%xmm3,%xmm2
17886320Sbholler	#palignr	$0x9,%xmm1,%xmm3
17896320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17906320Sbholler	.byte	0xd9,0x09
17916320Sbholler	movdqa	%xmm3,(%rcx)
17926320Sbholler
17936320Sbholler	movdqa	%xmm0,%xmm4
17946320Sbholler	#palignr	$0x9,%xmm2,%xmm0
17956320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
17966320Sbholler	.byte	0xc2,0x09
17976320Sbholler	movdqa	%xmm0,0x10(%rcx)
17986320Sbholler
17996320Sbholler	movdqa	%xmm5,%xmm1
18006320Sbholler	#palignr	$0x9,%xmm4,%xmm5
18016320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18026320Sbholler	.byte	0xec,0x09
18036320Sbholler	movdqa	%xmm5,0x20(%rcx)
18046320Sbholler
18056320Sbholler	lea	0x30(%rcx),%rcx
18066320Sbholler	jge	L(mov3dqa9)
18076320Sbholler
18086320Sbholler	cmp	$0x10,%r8
18096320Sbholler	jl	L(movdqa_epi)
18106320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
18116320Sbholler	sub	$0x10,%r8
18126320Sbholler	lea	0x10(%rdx),%rdx
18136320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
18146320Sbholler	#palignr	$0x9,%xmm1,%xmm3
18156320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18166320Sbholler	.byte	0xd9,0x09
18176320Sbholler
18186320Sbholler	cmp	$0x10,%r8
18196320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
18206320Sbholler	lea	0x10(%rcx),%rcx
18216320Sbholler	jl	L(movdqa_epi)
18226320Sbholler
18236320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
18246320Sbholler	sub	$0x10,%r8
18256320Sbholler	lea	0x10(%rdx),%rdx
18266320Sbholler	#palignr	$0x9,%xmm2,%xmm0
18276320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18286320Sbholler	.byte	0xc2,0x09
18296320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
18306320Sbholler	lea	0x10(%rcx),%rcx
18316320Sbholler	jmp	L(movdqa_epi)
18326320Sbholler
18336320Sbholler	.balign 16
18346320SbhollerL(mov3dqa10):
18356320Sbholler	movdqa	0x10(%rdx),%xmm3
18366320Sbholler	sub	$0x30,%r8
18376320Sbholler	movdqa	0x20(%rdx),%xmm0
18386320Sbholler	movdqa	0x30(%rdx),%xmm5
18396320Sbholler	lea	0x30(%rdx),%rdx
18406320Sbholler	cmp	$0x30,%r8
18416320Sbholler
18426320Sbholler	movdqa	%xmm3,%xmm2
18436320Sbholler	#palignr	$0xa,%xmm1,%xmm3
18446320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18456320Sbholler	.byte	0xd9,0x0a
18466320Sbholler	movdqa	%xmm3,(%rcx)
18476320Sbholler
18486320Sbholler	movdqa	%xmm0,%xmm4
18496320Sbholler	#palignr	$0xa,%xmm2,%xmm0
18506320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18516320Sbholler	.byte	0xc2,0x0a
18526320Sbholler	movdqa	%xmm0,0x10(%rcx)
18536320Sbholler
18546320Sbholler	movdqa	%xmm5,%xmm1
18556320Sbholler	#palignr	$0xa,%xmm4,%xmm5
18566320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18576320Sbholler	.byte	0xec,0x0a
18586320Sbholler	movdqa	%xmm5,0x20(%rcx)
18596320Sbholler
18606320Sbholler	lea	0x30(%rcx),%rcx
18616320Sbholler	jge	L(mov3dqa10)
18626320Sbholler
18636320Sbholler	cmp	$0x10,%r8
18646320Sbholler	jl	L(movdqa_epi)
18656320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
18666320Sbholler	sub	$0x10,%r8
18676320Sbholler	lea	0x10(%rdx),%rdx
18686320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
18696320Sbholler	#palignr	$0xa,%xmm1,%xmm3
18706320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18716320Sbholler	.byte	0xd9,0x0a
18726320Sbholler
18736320Sbholler	cmp	$0x10,%r8
18746320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
18756320Sbholler	lea	0x10(%rcx),%rcx
18766320Sbholler	jl	L(movdqa_epi)
18776320Sbholler
18786320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
18796320Sbholler	sub	$0x10,%r8
18806320Sbholler	lea	0x10(%rdx),%rdx
18816320Sbholler	#palignr	$0xa,%xmm2,%xmm0
18826320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
18836320Sbholler	.byte	0xc2,0x0a
18846320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
18856320Sbholler	lea	0x10(%rcx),%rcx
18866320Sbholler	jmp	L(movdqa_epi)
18876320Sbholler
18886320Sbholler	.balign 16
18896320SbhollerL(mov3dqa11):
18906320Sbholler	movdqa	0x10(%rdx),%xmm3
18916320Sbholler	sub	$0x30,%r8
18926320Sbholler	movdqa	0x20(%rdx),%xmm0
18936320Sbholler	movdqa	0x30(%rdx),%xmm5
18946320Sbholler	lea	0x30(%rdx),%rdx
18956320Sbholler	cmp	$0x30,%r8
18966320Sbholler
18976320Sbholler	movdqa	%xmm3,%xmm2
18986320Sbholler	#palignr	$0xb,%xmm1,%xmm3
18996320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19006320Sbholler	.byte	0xd9,0x0b
19016320Sbholler	movdqa	%xmm3,(%rcx)
19026320Sbholler
19036320Sbholler	movdqa	%xmm0,%xmm4
19046320Sbholler	#palignr	$0xb,%xmm2,%xmm0
19056320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19066320Sbholler	.byte	0xc2,0x0b
19076320Sbholler	movdqa	%xmm0,0x10(%rcx)
19086320Sbholler
19096320Sbholler	movdqa	%xmm5,%xmm1
19106320Sbholler	#palignr	$0xb,%xmm4,%xmm5
19116320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19126320Sbholler	.byte	0xec,0x0b
19136320Sbholler	movdqa	%xmm5,0x20(%rcx)
19146320Sbholler
19156320Sbholler	lea	0x30(%rcx),%rcx
19166320Sbholler	jge	L(mov3dqa11)
19176320Sbholler
19186320Sbholler	cmp	$0x10,%r8
19196320Sbholler	jl	L(movdqa_epi)
19206320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
19216320Sbholler	sub	$0x10,%r8
19226320Sbholler	lea	0x10(%rdx),%rdx
19236320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
19246320Sbholler	#palignr	$0xb,%xmm1,%xmm3
19256320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19266320Sbholler	.byte	0xd9,0x0b
19276320Sbholler
19286320Sbholler	cmp	$0x10,%r8
19296320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
19306320Sbholler	lea	0x10(%rcx),%rcx
19316320Sbholler	jl	L(movdqa_epi)
19326320Sbholler
19336320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
19346320Sbholler	sub	$0x10,%r8
19356320Sbholler	lea	0x10(%rdx),%rdx
19366320Sbholler	#palignr	$0xb,%xmm2,%xmm0
19376320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19386320Sbholler	.byte	0xc2,0x0b
19396320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
19406320Sbholler	lea	0x10(%rcx),%rcx
19416320Sbholler	jmp	L(movdqa_epi)
19426320Sbholler
19436320Sbholler	.balign 16
19446320SbhollerL(mov3dqa12):
19456320Sbholler	movdqa	0x10(%rdx),%xmm3
19466320Sbholler	sub	$0x30,%r8
19476320Sbholler	movdqa	0x20(%rdx),%xmm0
19486320Sbholler	movdqa	0x30(%rdx),%xmm5
19496320Sbholler	lea	0x30(%rdx),%rdx
19506320Sbholler	cmp	$0x30,%r8
19516320Sbholler
19526320Sbholler	movdqa	%xmm3,%xmm2
19536320Sbholler	#palignr	$0xc,%xmm1,%xmm3
19546320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19556320Sbholler	.byte	0xd9,0x0c
19566320Sbholler	movdqa	%xmm3,(%rcx)
19576320Sbholler
19586320Sbholler	movdqa	%xmm0,%xmm4
19596320Sbholler	#palignr	$0xc,%xmm2,%xmm0
19606320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19616320Sbholler	.byte	0xc2,0x0c
19626320Sbholler	movdqa	%xmm0,0x10(%rcx)
19636320Sbholler
19646320Sbholler	movdqa	%xmm5,%xmm1
19656320Sbholler	#palignr	$0xc,%xmm4,%xmm5
19666320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19676320Sbholler	.byte	0xec,0x0c
19686320Sbholler	movdqa	%xmm5,0x20(%rcx)
19696320Sbholler
19706320Sbholler	lea	0x30(%rcx),%rcx
19716320Sbholler	jge	L(mov3dqa12)
19726320Sbholler
19736320Sbholler	cmp	$0x10,%r8
19746320Sbholler	jl	L(movdqa_epi)
19756320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
19766320Sbholler	sub	$0x10,%r8
19776320Sbholler	lea	0x10(%rdx),%rdx
19786320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
19796320Sbholler	#palignr	$0xc,%xmm1,%xmm3
19806320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19816320Sbholler	.byte	0xd9,0x0c
19826320Sbholler
19836320Sbholler	cmp	$0x10,%r8
19846320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
19856320Sbholler	lea	0x10(%rcx),%rcx
19866320Sbholler	jl	L(movdqa_epi)
19876320Sbholler
19886320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
19896320Sbholler	sub	$0x10,%r8
19906320Sbholler	lea	0x10(%rdx),%rdx
19916320Sbholler	#palignr	$0xc,%xmm2,%xmm0
19926320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
19936320Sbholler	.byte	0xc2,0x0c
19946320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
19956320Sbholler	lea	0x10(%rcx),%rcx
19966320Sbholler	jmp	L(movdqa_epi)
19976320Sbholler
19986320Sbholler	.balign 16
19996320SbhollerL(mov3dqa13):
20006320Sbholler	movdqa	0x10(%rdx),%xmm3
20016320Sbholler	sub	$0x30,%r8
20026320Sbholler	movdqa	0x20(%rdx),%xmm0
20036320Sbholler	movdqa	0x30(%rdx),%xmm5
20046320Sbholler	lea	0x30(%rdx),%rdx
20056320Sbholler	cmp	$0x30,%r8
20066320Sbholler
20076320Sbholler	movdqa	%xmm3,%xmm2
20086320Sbholler	#palignr	$0xd,%xmm1,%xmm3
20096320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20106320Sbholler	.byte	0xd9,0x0d
20116320Sbholler	movdqa	%xmm3,(%rcx)
20126320Sbholler
20136320Sbholler	movdqa	%xmm0,%xmm4
20146320Sbholler	#palignr	$0xd,%xmm2,%xmm0
20156320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20166320Sbholler	.byte	0xc2,0x0d
20176320Sbholler	movdqa	%xmm0,0x10(%rcx)
20186320Sbholler
20196320Sbholler	movdqa	%xmm5,%xmm1
20206320Sbholler	#palignr	$0xd,%xmm4,%xmm5
20216320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20226320Sbholler	.byte	0xec,0x0d
20236320Sbholler	movdqa	%xmm5,0x20(%rcx)
20246320Sbholler
20256320Sbholler	lea	0x30(%rcx),%rcx
20266320Sbholler	jge	L(mov3dqa13)
20276320Sbholler
20286320Sbholler	cmp	$0x10,%r8
20296320Sbholler	jl	L(movdqa_epi)
20306320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
20316320Sbholler	sub	$0x10,%r8
20326320Sbholler	lea	0x10(%rdx),%rdx
20336320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
20346320Sbholler	#palignr	$0xd,%xmm1,%xmm3
20356320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20366320Sbholler	.byte	0xd9,0x0d
20376320Sbholler
20386320Sbholler	cmp	$0x10,%r8
20396320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
20406320Sbholler	lea	0x10(%rcx),%rcx
20416320Sbholler	jl	L(movdqa_epi)
20426320Sbholler
20436320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
20446320Sbholler	sub	$0x10,%r8
20456320Sbholler	lea	0x10(%rdx),%rdx
20466320Sbholler	#palignr	$0xd,%xmm2,%xmm0
20476320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20486320Sbholler	.byte	0xc2,0x0d
20496320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
20506320Sbholler	lea	0x10(%rcx),%rcx
20516320Sbholler	jmp	L(movdqa_epi)
20526320Sbholler
20536320Sbholler	.balign 16
20546320SbhollerL(mov3dqa14):
20556320Sbholler	movdqa	0x10(%rdx),%xmm3
20566320Sbholler	sub	$0x30,%r8
20576320Sbholler	movdqa	0x20(%rdx),%xmm0
20586320Sbholler	movdqa	0x30(%rdx),%xmm5
20596320Sbholler	lea	0x30(%rdx),%rdx
20606320Sbholler	cmp	$0x30,%r8
20616320Sbholler
20626320Sbholler	movdqa	%xmm3,%xmm2
20636320Sbholler	#palignr	$0xe,%xmm1,%xmm3
20646320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20656320Sbholler	.byte	0xd9,0x0e
20666320Sbholler	movdqa	%xmm3,(%rcx)
20676320Sbholler
20686320Sbholler	movdqa	%xmm0,%xmm4
20696320Sbholler	#palignr	$0xe,%xmm2,%xmm0
20706320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20716320Sbholler	.byte	0xc2,0x0e
20726320Sbholler	movdqa	%xmm0,0x10(%rcx)
20736320Sbholler
20746320Sbholler	movdqa	%xmm5,%xmm1
20756320Sbholler	#palignr	$0xe,%xmm4,%xmm5
20766320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20776320Sbholler	.byte	0xec,0x0e
20786320Sbholler	movdqa	%xmm5,0x20(%rcx)
20796320Sbholler
20806320Sbholler	lea	0x30(%rcx),%rcx
20816320Sbholler	jge	L(mov3dqa14)
20826320Sbholler
20836320Sbholler	cmp	$0x10,%r8
20846320Sbholler	jl	L(movdqa_epi)
20856320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
20866320Sbholler	sub	$0x10,%r8
20876320Sbholler	lea	0x10(%rdx),%rdx
20886320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
20896320Sbholler	#palignr	$0xe,%xmm1,%xmm3
20906320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
20916320Sbholler	.byte	0xd9,0x0e
20926320Sbholler
20936320Sbholler	cmp	$0x10,%r8
20946320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
20956320Sbholler	lea	0x10(%rcx),%rcx
20966320Sbholler	jl	L(movdqa_epi)
20976320Sbholler
20986320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
20996320Sbholler	sub	$0x10,%r8
21006320Sbholler	lea	0x10(%rdx),%rdx
21016320Sbholler	#palignr	$0xe,%xmm2,%xmm0
21026320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21036320Sbholler	.byte	0xc2,0x0e
21046320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
21056320Sbholler	lea	0x10(%rcx),%rcx
21066320Sbholler	jmp	L(movdqa_epi)
21076320Sbholler
21086320Sbholler	.balign 16
21096320SbhollerL(mov3dqa15):
21106320Sbholler	movdqa	0x10(%rdx),%xmm3
21116320Sbholler	sub	$0x30,%r8
21126320Sbholler	movdqa	0x20(%rdx),%xmm0
21136320Sbholler	movdqa	0x30(%rdx),%xmm5
21146320Sbholler	lea	0x30(%rdx),%rdx
21156320Sbholler	cmp	$0x30,%r8
21166320Sbholler
21176320Sbholler	movdqa	%xmm3,%xmm2
21186320Sbholler	#palignr	$0xf,%xmm1,%xmm3
21196320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21206320Sbholler	.byte	0xd9,0x0f
21216320Sbholler	movdqa	%xmm3,(%rcx)
21226320Sbholler
21236320Sbholler	movdqa	%xmm0,%xmm4
21246320Sbholler	#palignr	$0xf,%xmm2,%xmm0
21256320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21266320Sbholler	.byte	0xc2,0x0f
21276320Sbholler	movdqa	%xmm0,0x10(%rcx)
21286320Sbholler
21296320Sbholler	movdqa	%xmm5,%xmm1
21306320Sbholler	#palignr	$0xf,%xmm4,%xmm5
21316320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21326320Sbholler	.byte	0xec,0x0f
21336320Sbholler	movdqa	%xmm5,0x20(%rcx)
21346320Sbholler
21356320Sbholler	lea	0x30(%rcx),%rcx
21366320Sbholler	jge	L(mov3dqa15)
21376320Sbholler
21386320Sbholler	cmp	$0x10,%r8
21396320Sbholler	jl	L(movdqa_epi)
21406320Sbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
21416320Sbholler	sub	$0x10,%r8
21426320Sbholler	lea	0x10(%rdx),%rdx
21436320Sbholler	movdqa	%xmm3,%xmm2		# save for use next concat
21446320Sbholler	#palignr	$0xf,%xmm1,%xmm3
21456320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21466320Sbholler	.byte	0xd9,0x0f
21476320Sbholler
21486320Sbholler	cmp	$0x10,%r8
21496320Sbholler	movdqa	%xmm3,(%rcx)      	# store it
21506320Sbholler	lea	0x10(%rcx),%rcx
21516320Sbholler	jl	L(movdqa_epi)
21526320Sbholler
21536320Sbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
21546320Sbholler	sub	$0x10,%r8
21556320Sbholler	lea	0x10(%rdx),%rdx
21566320Sbholler	#palignr	$0xf,%xmm2,%xmm0
21576320Sbholler	.byte	0x66,0x0f,0x3a,0x0f
21586320Sbholler	.byte	0xc2,0x0f
21596320Sbholler	movdqa	%xmm0,(%rcx)      	# store it
21606320Sbholler	lea	0x10(%rcx),%rcx
21616320Sbholler	jmp	L(movdqa_epi)
21626320Sbholler
21636320Sbholler	.balign 16
21646320SbhollerL(sse2_nt_move):
21656320Sbholler	lea	0x40(%rcx),%rcx
21666320Sbholler	lea	0x40(%rdx),%rdx
21676320Sbholler	lea	-0x40(%r8),%r8
21686320Sbholler
21696320Sbholler	/*
21706320Sbholler	 * doesn't matter if source is aligned for stuff out of cache.
21716320Sbholler	 * the mis-aligned penalty is masked by the slowness of main memory.
21726320Sbholler	 */
21736320Sbholler	prefetchnta 0x180(%rdx)
21746320Sbholler	movdqu	-0x40(%rdx),%xmm0
21756320Sbholler	movdqu	-0x30(%rdx),%xmm1
21766320Sbholler
21776320Sbholler	cmp	$0x40,%r8
21786320Sbholler	movntdq	%xmm0,-0x40(%rcx)
21796320Sbholler	movntdq	%xmm1,-0x30(%rcx)
21806320Sbholler
21816320Sbholler	movdqu	-0x20(%rdx),%xmm2
21826320Sbholler	movdqu	-0x10(%rdx),%xmm3
21836320Sbholler
21846320Sbholler	movntdq	%xmm2,-0x20(%rcx)
21856320Sbholler	movntdq	%xmm3,-0x10(%rcx)
21866320Sbholler
21876320Sbholler	jge	L(sse2_nt_move)
21886320Sbholler
21896320Sbholler	lea	L(Fix16EndTable)(%rip),%r10
21906320Sbholler	mov	%r8,%r9
21916320Sbholler	and	$0xFFFFFFFFFFFFFFF0,%r9
21926320Sbholler	add	%r9,%rcx
21936320Sbholler	add	%r9,%rdx
21946320Sbholler	sub	%r9,%r8
21956320Sbholler	shr	$0x4,%r9
21966320Sbholler	sfence
21976320Sbholler
21986320Sbholler	movslq	(%r10,%r9,4),%r11
21996320Sbholler	lea	(%r11,%r10,1),%r10
22006320Sbholler	jmpq	*%r10
22016320Sbholler
22026320Sbholler	.balign 16
22036320SbhollerL(Fix16EndTable):
22046320Sbholler	.int    L(fix16_0)-L(Fix16EndTable)
22056320Sbholler	.int    L(fix16_1)-L(Fix16EndTable)
22066320Sbholler	.int    L(fix16_2)-L(Fix16EndTable)
22076320Sbholler	.int    L(fix16_3)-L(Fix16EndTable)
22086320Sbholler
22096320Sbholler	.balign 16
22106320SbhollerL(fix16_3):
22116320Sbholler	movdqu -0x30(%rdx),%xmm1
22126320Sbholler	movdqa %xmm1,-0x30(%rcx)
22136320SbhollerL(fix16_2):
22146320Sbholler	movdqu -0x20(%rdx),%xmm2
22156320Sbholler	movdqa %xmm2,-0x20(%rcx)
22166320SbhollerL(fix16_1):
22176320Sbholler	movdqu -0x10(%rdx),%xmm3
22186320Sbholler	movdqa %xmm3,-0x10(%rcx)
22196320SbhollerL(fix16_0):
22206320Sbholler	lea    L(fwdPxQx)(%rip),%r10
22216320Sbholler	add    %r8,%rdx
22226320Sbholler	add    %r8,%rcx
22236320Sbholler
22246320Sbholler	movslq (%r10,%r8,4),%r9
22256320Sbholler	lea    (%r9,%r10,1),%r10
22266320Sbholler	jmpq   *%r10
22276320Sbholler
22286320Sbholler	.balign 16
22296320SbhollerL(pre_both_aligned):
22306320Sbholler	cmp    $0x80,%r8
22316320Sbholler	jl     L(fix_16b)
22326320Sbholler
22336320Sbholler	.balign 16
22346320SbhollerL(both_aligned):
22356320Sbholler
22366320Sbholler	/*
22376320Sbholler	 * this 'paired' load/load/store/store seems to do best.
22386320Sbholler	 */
22396320Sbholler	movdqa (%rdx),%xmm0
22406320Sbholler	movdqa 0x10(%rdx),%xmm1
22416320Sbholler
22426320Sbholler	movdqa %xmm0,(%rcx)
22436320Sbholler	movdqa %xmm1,0x10(%rcx)
22446320Sbholler	lea    -0x80(%r8),%r8
22456320Sbholler
22466320Sbholler	movdqa 0x20(%rdx),%xmm2
22476320Sbholler	movdqa 0x30(%rdx),%xmm3
22486320Sbholler
22496320Sbholler	movdqa %xmm2,0x20(%rcx)
22506320Sbholler	movdqa %xmm3,0x30(%rcx)
22516320Sbholler
22526320Sbholler	movdqa 0x40(%rdx),%xmm0
22536320Sbholler	movdqa 0x50(%rdx),%xmm1
22546320Sbholler	cmp    $0x80,%r8
22556320Sbholler
22566320Sbholler	movdqa %xmm0,0x40(%rcx)
22576320Sbholler	movdqa %xmm1,0x50(%rcx)
22586320Sbholler
22596320Sbholler	movdqa 0x60(%rdx),%xmm2
22606320Sbholler	movdqa 0x70(%rdx),%xmm3
22616320Sbholler	lea    0x80(%rdx),%rdx
22626320Sbholler	movdqa %xmm2,0x60(%rcx)
22636320Sbholler	movdqa %xmm3,0x70(%rcx)
22646320Sbholler	lea    0x80(%rcx),%rcx
22656320Sbholler	jge    L(both_aligned)
22666320Sbholler
22676320SbhollerL(fix_16b):
22686320Sbholler	add    %r8,%rcx
22696320Sbholler	lea    L(fwdPxQx)(%rip),%r10
22706320Sbholler	add    %r8,%rdx
22716320Sbholler
22726320Sbholler	movslq (%r10,%r8,4),%r9
22736320Sbholler	lea    (%r9,%r10,1),%r10
22746320Sbholler	jmpq   *%r10
22756320Sbholler
22766320Sbholler	.balign 16
22776320SbhollerL(Loop8byte_pre):
22786320Sbholler	# Use 8-byte moves
22796320Sbholler	mov    .largest_level_cache_size(%rip),%r9d
22806320Sbholler	shr    %r9		# take half of it
22816320Sbholler	cmp    %r9,%r8
2282*10024Sbostrovs	jge    L(byte8_nt_top)
22836320Sbholler	# Find out whether to use rep movsq
22846320Sbholler	cmp    $4096,%r8
22856320Sbholler	jle    L(byte8_top)
22866320Sbholler	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
22876320Sbholler	cmp    %r9,%r8
22886320Sbholler	jle    L(use_rep)
22896320Sbholler
22906320Sbholler	.balign     16
22916320SbhollerL(byte8_top):
22926320Sbholler	mov    (%rdx),%r9
22936320Sbholler	mov    0x8(%rdx),%r10
22946320Sbholler	lea    -0x40(%r8),%r8
22956320Sbholler	mov    %r9,(%rcx)
22966320Sbholler	mov    %r10,0x8(%rcx)
22976320Sbholler	mov    0x10(%rdx),%r11
22986320Sbholler	mov    0x18(%rdx),%r9
22996320Sbholler	mov    %r11,0x10(%rcx)
23006320Sbholler	mov    %r9,0x18(%rcx)
23016320Sbholler
23026320Sbholler	cmp    $0x40,%r8
23036320Sbholler	mov    0x20(%rdx),%r10
23046320Sbholler	mov    0x28(%rdx),%r11
23056320Sbholler	mov    %r10,0x20(%rcx)
23066320Sbholler	mov    %r11,0x28(%rcx)
23076320Sbholler	mov    0x30(%rdx),%r9
23086320Sbholler	mov    0x38(%rdx),%r10
23096320Sbholler	lea    0x40(%rdx),%rdx
23106320Sbholler	mov    %r9,0x30(%rcx)
23116320Sbholler	mov    %r10,0x38(%rcx)
23126320Sbholler	lea    0x40(%rcx),%rcx
23136320Sbholler	jg     L(byte8_top)
23146320Sbholler
23156320SbhollerL(byte8_end):
23166320Sbholler	lea    L(fwdPxQx)(%rip),%r10
23176320Sbholler	lea    (%rdx,%r8,1),%rdx
23186320Sbholler	lea    (%rcx,%r8,1),%rcx
23196320Sbholler
23206320Sbholler	movslq (%r10,%r8,4),%r9
23216320Sbholler	lea    (%r9,%r10,1),%r10
23226320Sbholler	jmpq   *%r10
23236320Sbholler
23246320Sbholler	.balign	16
23256320SbhollerL(use_rep):
23266320Sbholler	mov    %rdx,%rsi		# %rsi = source
23276320Sbholler	mov    %rcx,%rdi		# %rdi = destination
23286320Sbholler	mov    %r8,%rcx			# %rcx = count
23296320Sbholler	shrq   $3,%rcx			# 8-byte word count
23300Sstevel@tonic-gate	rep
23316320Sbholler	  movsq
23326320Sbholler	mov    %rsi,%rdx		# source
23336320Sbholler	mov    %rdi,%rcx		# destination
23346320Sbholler	andq   $7,%r8			# remainder
23356320Sbholler	jnz    L(byte8_end)
23360Sstevel@tonic-gate	ret
23370Sstevel@tonic-gate
23386320Sbholler	.balign 16
23396320SbhollerL(byte8_nt_top):
23406320Sbholler	sub    $0x40,%r8
23416320Sbholler	prefetchnta 0x180(%rdx)
23426320Sbholler	mov    (%rdx),%r9
23436320Sbholler	movnti %r9,(%rcx)
23446320Sbholler	mov    0x8(%rdx),%r10
23456320Sbholler	movnti %r10,0x8(%rcx)
23466320Sbholler	mov    0x10(%rdx),%r11
23476320Sbholler	movnti %r11,0x10(%rcx)
23486320Sbholler	mov    0x18(%rdx),%r9
23496320Sbholler	movnti %r9,0x18(%rcx)
23506320Sbholler	mov    0x20(%rdx),%r10
23516320Sbholler	movnti %r10,0x20(%rcx)
23526320Sbholler	mov    0x28(%rdx),%r11
23536320Sbholler	movnti %r11,0x28(%rcx)
23546320Sbholler	mov    0x30(%rdx),%r9
23556320Sbholler	movnti %r9,0x30(%rcx)
23566320Sbholler	mov    0x38(%rdx),%r10
23576320Sbholler	movnti %r10,0x38(%rcx)
23586320Sbholler
23596320Sbholler	lea    0x40(%rdx),%rdx
23606320Sbholler	lea    0x40(%rcx),%rcx
23616320Sbholler	cmp    $0x40,%r8
23626320Sbholler	jge    L(byte8_nt_top)
23636320Sbholler	sfence
23646320Sbholler	jmp    L(byte8_end)
23656320Sbholler
23666320Sbholler	SET_SIZE(memcpy)
23676320Sbholler
23686320Sbholler	.balign 16
23696320SbhollerL(CopyBackwards):
23706320Sbholler	mov    %rdx,%r8
23716320Sbholler	mov    %rdi,%rcx
23726320Sbholler	mov    %rsi,%rdx
23736320Sbholler	mov    %rdi,%rax		# return value
23746320Sbholler
23756320Sbholler	# ck alignment of last byte
23766320Sbholler	lea    (%rcx,%r8,1),%rcx
23776320Sbholler	test   $0x7,%rcx
23786320Sbholler	lea    (%rdx,%r8,1),%rdx
23796320Sbholler	jne    L(bk_align)
23806320Sbholler
23816320SbhollerL(bk_qw_aligned):
23826320Sbholler	lea    L(bkPxQx)(%rip),%r10
23836320Sbholler
23846320Sbholler	cmp    $0x90,%r8		# 144
23856320Sbholler	jg     L(bk_ck_sse2_alignment)
23866320Sbholler
23876320Sbholler	sub    %r8,%rcx
23886320Sbholler	sub    %r8,%rdx
23896320Sbholler
23906320Sbholler	movslq (%r10,%r8,4),%r9
23916320Sbholler	lea    (%r9,%r10,1),%r10
23926320Sbholler	jmpq   *%r10
23936320Sbholler
23946320Sbholler	.balign 16
23956320SbhollerL(bk_align):
23966320Sbholler	# only align if len > 8
23976320Sbholler	cmp    $8,%r8
23986320Sbholler	jle    L(bk_qw_aligned)
23996320Sbholler	test   $0x1,%rcx
24006320Sbholler	je     L(bk_tst2)
24016320Sbholler	dec    %rcx
24026320Sbholler	dec    %rdx
24036320Sbholler	dec    %r8
24046320Sbholler	mov    (%rdx),%r9b
24056320Sbholler	mov    %r9b,(%rcx)
24066320Sbholler
24076320SbhollerL(bk_tst2):
24086320Sbholler	test   $0x2,%rcx
24096320Sbholler	je     L(bk_tst3)
24106320Sbholler
24116320SbhollerL(bk_got2):
24126320Sbholler	sub    $0x2,%rcx
24136320Sbholler	sub    $0x2,%rdx
24146320Sbholler	sub    $0x2,%r8
24156320Sbholler	movzwq (%rdx),%r9
24166320Sbholler	mov    %r9w,(%rcx)
24176320Sbholler
24186320SbhollerL(bk_tst3):
24196320Sbholler	test   $0x4,%rcx
24206320Sbholler	je     L(bk_qw_aligned)
24216320Sbholler
24226320SbhollerL(bk_got3):
24236320Sbholler	sub    $0x4,%rcx
24246320Sbholler	sub    $0x4,%rdx
24256320Sbholler	sub    $0x4,%r8
24266320Sbholler	mov    (%rdx),%r9d
24276320Sbholler	mov    %r9d,(%rcx)
24286320Sbholler	jmp    L(bk_qw_aligned)
24296320Sbholler
24306320Sbholler	.balign 16
24316320SbhollerL(bk_ck_sse2_alignment):
24326320Sbholler	cmpl   $NO_SSE,.memops_method(%rip)
24336320Sbholler	je     L(bk_use_rep)
24346320Sbholler	# check alignment of last byte
24356320Sbholler	test   $0xf,%rcx
24366320Sbholler	jz     L(bk_sse2_cpy)
24376320Sbholler
24386320SbhollerL(bk_sse2_align):
24396320Sbholler	# only here if already aligned on at least a qword bndry
24406320Sbholler	sub    $0x8,%rcx
24416320Sbholler	sub    $0x8,%rdx
24426320Sbholler	sub    $0x8,%r8
24436320Sbholler	mov    (%rdx),%r9
24446320Sbholler	mov    %r9,(%rcx)
24456320Sbholler	#jmp   L(bk_sse2_cpy)
24466320Sbholler
24476320Sbholler	.balign 16
24486320SbhollerL(bk_sse2_cpy):
24496320Sbholler	sub    $0x80,%rcx		# 128
24506320Sbholler	sub    $0x80,%rdx
24516320Sbholler	movdqu 0x70(%rdx),%xmm3
24526320Sbholler	movdqu 0x60(%rdx),%xmm2
24536320Sbholler	movdqa %xmm3,0x70(%rcx)
24546320Sbholler	movdqa %xmm2,0x60(%rcx)
24556320Sbholler	sub    $0x80,%r8
24566320Sbholler	movdqu 0x50(%rdx),%xmm1
24576320Sbholler	movdqu 0x40(%rdx),%xmm0
24586320Sbholler	movdqa %xmm1,0x50(%rcx)
24596320Sbholler	movdqa %xmm0,0x40(%rcx)
24606320Sbholler
24616320Sbholler	cmp    $0x80,%r8
24626320Sbholler	movdqu 0x30(%rdx),%xmm3
24636320Sbholler	movdqu 0x20(%rdx),%xmm2
24646320Sbholler	movdqa %xmm3,0x30(%rcx)
24656320Sbholler	movdqa %xmm2,0x20(%rcx)
24666320Sbholler	movdqu 0x10(%rdx),%xmm1
24676320Sbholler	movdqu (%rdx),%xmm0
24686320Sbholler	movdqa %xmm1,0x10(%rcx)
24696320Sbholler	movdqa %xmm0,(%rcx)
24706320Sbholler	jge    L(bk_sse2_cpy)
24716320Sbholler
24726320SbhollerL(bk_sse2_cpy_end):
24736320Sbholler	lea    L(bkPxQx)(%rip),%r10
24746320Sbholler	sub    %r8,%rdx
24756320Sbholler	sub    %r8,%rcx
24766320Sbholler	movslq (%r10,%r8,4),%r9
24776320Sbholler	lea    (%r9,%r10,1),%r10
24786320Sbholler	jmpq   *%r10
24796320Sbholler
24806320Sbholler	.balign 16
24816320SbhollerL(bk_use_rep):
24826320Sbholler	xchg   %rcx,%r9
24836320Sbholler	mov    %rdx,%rsi		# source
24846320Sbholler	mov    %r9,%rdi			# destination
24856320Sbholler	mov    %r8,%rcx			# count
24866320Sbholler	sub    $8,%rsi
24876320Sbholler	sub    $8,%rdi
24886320Sbholler	shr    $3,%rcx
24896320Sbholler	std				# reverse direction
24906320Sbholler	rep
24916320Sbholler	  movsq
24926320Sbholler	cld				# reset direction flag
24936320Sbholler
24946320Sbholler	xchg   %rcx,%r9
24956320Sbholler	lea    L(bkPxQx)(%rip),%r10
24966320Sbholler	sub    %r8,%rdx
24976320Sbholler	sub    %r8,%rcx
24986320Sbholler	andq   $7,%r8			# remainder
24996320Sbholler	jz     2f
25006320Sbholler	movslq (%r10,%r8,4),%r9
25016320Sbholler	lea    (%r9,%r10,1),%r10
25026320Sbholler	jmpq   *%r10
25036320Sbholler2:
25046320Sbholler	ret
25056320Sbholler
25066320Sbholler	.balign 16
25076320SbhollerL(bkP0QI):
25086320Sbholler	mov    0x88(%rdx),%r10
25096320Sbholler	mov    %r10,0x88(%rcx)
25106320SbhollerL(bkP0QH):
25116320Sbholler	mov    0x80(%rdx),%r10
25126320Sbholler	mov    %r10,0x80(%rcx)
25136320SbhollerL(bkP0QG):
25146320Sbholler	mov    0x78(%rdx),%r9
25156320Sbholler	mov    %r9,0x78(%rcx)
25166320SbhollerL(bkP0QF):
25176320Sbholler	mov    0x70(%rdx),%r11
25186320Sbholler	mov    %r11,0x70(%rcx)
25196320SbhollerL(bkP0QE):
25206320Sbholler	mov    0x68(%rdx),%r10
25216320Sbholler	mov    %r10,0x68(%rcx)
25226320SbhollerL(bkP0QD):
25236320Sbholler	mov    0x60(%rdx),%r9
25246320Sbholler	mov    %r9,0x60(%rcx)
25256320SbhollerL(bkP0QC):
25266320Sbholler	mov    0x58(%rdx),%r11
25276320Sbholler	mov    %r11,0x58(%rcx)
25286320SbhollerL(bkP0QB):
25296320Sbholler	mov    0x50(%rdx),%r10
25306320Sbholler	mov    %r10,0x50(%rcx)
25316320SbhollerL(bkP0QA):
25326320Sbholler	mov    0x48(%rdx),%r9
25336320Sbholler	mov    %r9,0x48(%rcx)
25346320SbhollerL(bkP0Q9):
25356320Sbholler	mov    0x40(%rdx),%r11
25366320Sbholler	mov    %r11,0x40(%rcx)
25376320SbhollerL(bkP0Q8):
25386320Sbholler	mov    0x38(%rdx),%r10
25396320Sbholler	mov    %r10,0x38(%rcx)
25406320SbhollerL(bkP0Q7):
25416320Sbholler	mov    0x30(%rdx),%r9
25426320Sbholler	mov    %r9,0x30(%rcx)
25436320SbhollerL(bkP0Q6):
25446320Sbholler	mov    0x28(%rdx),%r11
25456320Sbholler	mov    %r11,0x28(%rcx)
25466320SbhollerL(bkP0Q5):
25476320Sbholler	mov    0x20(%rdx),%r10
25486320Sbholler	mov    %r10,0x20(%rcx)
25496320SbhollerL(bkP0Q4):
25506320Sbholler	mov    0x18(%rdx),%r9
25516320Sbholler	mov    %r9,0x18(%rcx)
25526320SbhollerL(bkP0Q3):
25536320Sbholler	mov    0x10(%rdx),%r11
25546320Sbholler	mov    %r11,0x10(%rcx)
25556320SbhollerL(bkP0Q2):
25566320Sbholler	mov    0x8(%rdx),%r10
25576320Sbholler	mov    %r10,0x8(%rcx)
25586320SbhollerL(bkP0Q1):
25596320Sbholler	mov    (%rdx),%r9
25606320Sbholler	mov    %r9,(%rcx)
25616320SbhollerL(bkP0Q0):
25626320Sbholler	ret
25636320Sbholler
25646320Sbholler	.balign 16
25656320SbhollerL(bkP1QI):
25666320Sbholler	mov    0x89(%rdx),%r10
25676320Sbholler	mov    %r10,0x89(%rcx)
25686320SbhollerL(bkP1QH):
25696320Sbholler	mov    0x81(%rdx),%r11
25706320Sbholler	mov    %r11,0x81(%rcx)
25716320SbhollerL(bkP1QG):
25726320Sbholler	mov    0x79(%rdx),%r10
25736320Sbholler	mov    %r10,0x79(%rcx)
25746320SbhollerL(bkP1QF):
25756320Sbholler	mov    0x71(%rdx),%r9
25766320Sbholler	mov    %r9,0x71(%rcx)
25776320SbhollerL(bkP1QE):
25786320Sbholler	mov    0x69(%rdx),%r11
25796320Sbholler	mov    %r11,0x69(%rcx)
25806320SbhollerL(bkP1QD):
25816320Sbholler	mov    0x61(%rdx),%r10
25826320Sbholler	mov    %r10,0x61(%rcx)
25836320SbhollerL(bkP1QC):
25846320Sbholler	mov    0x59(%rdx),%r9
25856320Sbholler	mov    %r9,0x59(%rcx)
25866320SbhollerL(bkP1QB):
25876320Sbholler	mov    0x51(%rdx),%r11
25886320Sbholler	mov    %r11,0x51(%rcx)
25896320SbhollerL(bkP1QA):
25906320Sbholler	mov    0x49(%rdx),%r10
25916320Sbholler	mov    %r10,0x49(%rcx)
25926320SbhollerL(bkP1Q9):
25936320Sbholler	mov    0x41(%rdx),%r9
25946320Sbholler	mov    %r9,0x41(%rcx)
25956320SbhollerL(bkP1Q8):
25966320Sbholler	mov    0x39(%rdx),%r11
25976320Sbholler	mov    %r11,0x39(%rcx)
25986320SbhollerL(bkP1Q7):
25996320Sbholler	mov    0x31(%rdx),%r10
26006320Sbholler	mov    %r10,0x31(%rcx)
26016320SbhollerL(bkP1Q6):
26026320Sbholler	mov    0x29(%rdx),%r9
26036320Sbholler	mov    %r9,0x29(%rcx)
26046320SbhollerL(bkP1Q5):
26056320Sbholler	mov    0x21(%rdx),%r11
26066320Sbholler	mov    %r11,0x21(%rcx)
26076320SbhollerL(bkP1Q4):
26086320Sbholler	mov    0x19(%rdx),%r10
26096320Sbholler	mov    %r10,0x19(%rcx)
26106320SbhollerL(bkP1Q3):
26116320Sbholler	mov    0x11(%rdx),%r9
26126320Sbholler	mov    %r9,0x11(%rcx)
26136320SbhollerL(bkP1Q2):
26146320Sbholler	mov    0x9(%rdx),%r11
26156320Sbholler	mov    %r11,0x9(%rcx)
26166320SbhollerL(bkP1Q1):
26176320Sbholler	mov    0x1(%rdx),%r10
26186320Sbholler	mov    %r10,0x1(%rcx)
26196320SbhollerL(bkP1Q0):
26206320Sbholler	mov    (%rdx),%r9b
26216320Sbholler	mov    %r9b,(%rcx)
26226320Sbholler	ret
26236320Sbholler
26246320Sbholler	.balign 16
26256320SbhollerL(bkP2QI):
26266320Sbholler	mov    0x8a(%rdx),%r10
26276320Sbholler	mov    %r10,0x8a(%rcx)
26286320SbhollerL(bkP2QH):
26296320Sbholler	mov    0x82(%rdx),%r11
26306320Sbholler	mov    %r11,0x82(%rcx)
26316320SbhollerL(bkP2QG):
26326320Sbholler	mov    0x7a(%rdx),%r10
26336320Sbholler	mov    %r10,0x7a(%rcx)
26346320SbhollerL(bkP2QF):
26356320Sbholler	mov    0x72(%rdx),%r9
26366320Sbholler	mov    %r9,0x72(%rcx)
26376320SbhollerL(bkP2QE):
26386320Sbholler	mov    0x6a(%rdx),%r11
26396320Sbholler	mov    %r11,0x6a(%rcx)
26406320SbhollerL(bkP2QD):
26416320Sbholler	mov    0x62(%rdx),%r10
26426320Sbholler	mov    %r10,0x62(%rcx)
26436320SbhollerL(bkP2QC):
26446320Sbholler	mov    0x5a(%rdx),%r9
26456320Sbholler	mov    %r9,0x5a(%rcx)
26466320SbhollerL(bkP2QB):
26476320Sbholler	mov    0x52(%rdx),%r11
26486320Sbholler	mov    %r11,0x52(%rcx)
26496320SbhollerL(bkP2QA):
26506320Sbholler	mov    0x4a(%rdx),%r10
26516320Sbholler	mov    %r10,0x4a(%rcx)
26526320SbhollerL(bkP2Q9):
26536320Sbholler	mov    0x42(%rdx),%r9
26546320Sbholler	mov    %r9,0x42(%rcx)
26556320SbhollerL(bkP2Q8):
26566320Sbholler	mov    0x3a(%rdx),%r11
26576320Sbholler	mov    %r11,0x3a(%rcx)
26586320SbhollerL(bkP2Q7):
26596320Sbholler	mov    0x32(%rdx),%r10
26606320Sbholler	mov    %r10,0x32(%rcx)
26616320SbhollerL(bkP2Q6):
26626320Sbholler	mov    0x2a(%rdx),%r9
26636320Sbholler	mov    %r9,0x2a(%rcx)
26646320SbhollerL(bkP2Q5):
26656320Sbholler	mov    0x22(%rdx),%r11
26666320Sbholler	mov    %r11,0x22(%rcx)
26676320SbhollerL(bkP2Q4):
26686320Sbholler	mov    0x1a(%rdx),%r10
26696320Sbholler	mov    %r10,0x1a(%rcx)
26706320SbhollerL(bkP2Q3):
26716320Sbholler	mov    0x12(%rdx),%r9
26726320Sbholler	mov    %r9,0x12(%rcx)
26736320SbhollerL(bkP2Q2):
26746320Sbholler	mov    0xa(%rdx),%r11
26756320Sbholler	mov    %r11,0xa(%rcx)
26766320SbhollerL(bkP2Q1):
26776320Sbholler	mov    0x2(%rdx),%r10
26786320Sbholler	mov    %r10,0x2(%rcx)
26796320SbhollerL(bkP2Q0):
26806320Sbholler	mov    (%rdx),%r9w
26816320Sbholler	mov    %r9w,(%rcx)
26826320Sbholler	ret
26836320Sbholler
26846320Sbholler	.balign 16
26856320SbhollerL(bkP3QI):
26866320Sbholler	mov    0x8b(%rdx),%r10
26876320Sbholler	mov    %r10,0x8b(%rcx)
26886320SbhollerL(bkP3QH):
26896320Sbholler	mov    0x83(%rdx),%r11
26906320Sbholler	mov    %r11,0x83(%rcx)
26916320SbhollerL(bkP3QG):
26926320Sbholler	mov    0x7b(%rdx),%r10
26936320Sbholler	mov    %r10,0x7b(%rcx)
26946320SbhollerL(bkP3QF):
26956320Sbholler	mov    0x73(%rdx),%r9
26966320Sbholler	mov    %r9,0x73(%rcx)
26976320SbhollerL(bkP3QE):
26986320Sbholler	mov    0x6b(%rdx),%r11
26996320Sbholler	mov    %r11,0x6b(%rcx)
27006320SbhollerL(bkP3QD):
27016320Sbholler	mov    0x63(%rdx),%r10
27026320Sbholler	mov    %r10,0x63(%rcx)
27036320SbhollerL(bkP3QC):
27046320Sbholler	mov    0x5b(%rdx),%r9
27056320Sbholler	mov    %r9,0x5b(%rcx)
27066320SbhollerL(bkP3QB):
27076320Sbholler	mov    0x53(%rdx),%r11
27086320Sbholler	mov    %r11,0x53(%rcx)
27096320SbhollerL(bkP3QA):
27106320Sbholler	mov    0x4b(%rdx),%r10
27116320Sbholler	mov    %r10,0x4b(%rcx)
27126320SbhollerL(bkP3Q9):
27136320Sbholler	mov    0x43(%rdx),%r9
27146320Sbholler	mov    %r9,0x43(%rcx)
27156320SbhollerL(bkP3Q8):
27166320Sbholler	mov    0x3b(%rdx),%r11
27176320Sbholler	mov    %r11,0x3b(%rcx)
27186320SbhollerL(bkP3Q7):
27196320Sbholler	mov    0x33(%rdx),%r10
27206320Sbholler	mov    %r10,0x33(%rcx)
27216320SbhollerL(bkP3Q6):
27226320Sbholler	mov    0x2b(%rdx),%r9
27236320Sbholler	mov    %r9,0x2b(%rcx)
27246320SbhollerL(bkP3Q5):
27256320Sbholler	mov    0x23(%rdx),%r11
27266320Sbholler	mov    %r11,0x23(%rcx)
27276320SbhollerL(bkP3Q4):
27286320Sbholler	mov    0x1b(%rdx),%r10
27296320Sbholler	mov    %r10,0x1b(%rcx)
27306320SbhollerL(bkP3Q3):
27316320Sbholler	mov    0x13(%rdx),%r9
27326320Sbholler	mov    %r9,0x13(%rcx)
27336320SbhollerL(bkP3Q2):
27346320Sbholler	mov    0xb(%rdx),%r11
27356320Sbholler	mov    %r11,0xb(%rcx)
27366320SbhollerL(bkP3Q1):
27376320Sbholler	mov    0x3(%rdx),%r10
27386320Sbholler	mov    %r10,0x3(%rcx)
27396320SbhollerL(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
27406320Sbholler	mov    0x1(%rdx),%r9w
27416320Sbholler	mov    %r9w,0x1(%rcx)
27426320Sbholler	mov    (%rdx),%r10b
27436320Sbholler	mov    %r10b,(%rcx)
27446320Sbholler	ret
27456320Sbholler
27466320Sbholler	.balign 16
27476320SbhollerL(bkP4QI):
27486320Sbholler	mov    0x8c(%rdx),%r10
27496320Sbholler	mov    %r10,0x8c(%rcx)
27506320SbhollerL(bkP4QH):
27516320Sbholler	mov    0x84(%rdx),%r11
27526320Sbholler	mov    %r11,0x84(%rcx)
27536320SbhollerL(bkP4QG):
27546320Sbholler	mov    0x7c(%rdx),%r10
27556320Sbholler	mov    %r10,0x7c(%rcx)
27566320SbhollerL(bkP4QF):
27576320Sbholler	mov    0x74(%rdx),%r9
27586320Sbholler	mov    %r9,0x74(%rcx)
27596320SbhollerL(bkP4QE):
27606320Sbholler	mov    0x6c(%rdx),%r11
27616320Sbholler	mov    %r11,0x6c(%rcx)
27626320SbhollerL(bkP4QD):
27636320Sbholler	mov    0x64(%rdx),%r10
27646320Sbholler	mov    %r10,0x64(%rcx)
27656320SbhollerL(bkP4QC):
27666320Sbholler	mov    0x5c(%rdx),%r9
27676320Sbholler	mov    %r9,0x5c(%rcx)
27686320SbhollerL(bkP4QB):
27696320Sbholler	mov    0x54(%rdx),%r11
27706320Sbholler	mov    %r11,0x54(%rcx)
27716320SbhollerL(bkP4QA):
27726320Sbholler	mov    0x4c(%rdx),%r10
27736320Sbholler	mov    %r10,0x4c(%rcx)
27746320SbhollerL(bkP4Q9):
27756320Sbholler	mov    0x44(%rdx),%r9
27766320Sbholler	mov    %r9,0x44(%rcx)
27776320SbhollerL(bkP4Q8):
27786320Sbholler	mov    0x3c(%rdx),%r11
27796320Sbholler	mov    %r11,0x3c(%rcx)
27806320SbhollerL(bkP4Q7):
27816320Sbholler	mov    0x34(%rdx),%r10
27826320Sbholler	mov    %r10,0x34(%rcx)
27836320SbhollerL(bkP4Q6):
27846320Sbholler	mov    0x2c(%rdx),%r9
27856320Sbholler	mov    %r9,0x2c(%rcx)
27866320SbhollerL(bkP4Q5):
27876320Sbholler	mov    0x24(%rdx),%r11
27886320Sbholler	mov    %r11,0x24(%rcx)
27896320SbhollerL(bkP4Q4):
27906320Sbholler	mov    0x1c(%rdx),%r10
27916320Sbholler	mov    %r10,0x1c(%rcx)
27926320SbhollerL(bkP4Q3):
27936320Sbholler	mov    0x14(%rdx),%r9
27946320Sbholler	mov    %r9,0x14(%rcx)
27956320SbhollerL(bkP4Q2):
27966320Sbholler	mov    0xc(%rdx),%r11
27976320Sbholler	mov    %r11,0xc(%rcx)
27986320SbhollerL(bkP4Q1):
27996320Sbholler	mov    0x4(%rdx),%r10
28006320Sbholler	mov    %r10,0x4(%rcx)
28016320SbhollerL(bkP4Q0):
28026320Sbholler	mov    (%rdx),%r9d
28036320Sbholler	mov    %r9d,(%rcx)
28046320Sbholler	ret
28056320Sbholler
28066320Sbholler	.balign 16
28076320SbhollerL(bkP5QI):
28086320Sbholler	mov    0x8d(%rdx),%r10
28096320Sbholler	mov    %r10,0x8d(%rcx)
28106320SbhollerL(bkP5QH):
28116320Sbholler	mov    0x85(%rdx),%r9
28126320Sbholler	mov    %r9,0x85(%rcx)
28136320SbhollerL(bkP5QG):
28146320Sbholler	mov    0x7d(%rdx),%r11
28156320Sbholler	mov    %r11,0x7d(%rcx)
28166320SbhollerL(bkP5QF):
28176320Sbholler	mov    0x75(%rdx),%r10
28186320Sbholler	mov    %r10,0x75(%rcx)
28196320SbhollerL(bkP5QE):
28206320Sbholler	mov    0x6d(%rdx),%r9
28216320Sbholler	mov    %r9,0x6d(%rcx)
28226320SbhollerL(bkP5QD):
28236320Sbholler	mov    0x65(%rdx),%r11
28246320Sbholler	mov    %r11,0x65(%rcx)
28256320SbhollerL(bkP5QC):
28266320Sbholler	mov    0x5d(%rdx),%r10
28276320Sbholler	mov    %r10,0x5d(%rcx)
28286320SbhollerL(bkP5QB):
28296320Sbholler	mov    0x55(%rdx),%r9
28306320Sbholler	mov    %r9,0x55(%rcx)
28316320SbhollerL(bkP5QA):
28326320Sbholler	mov    0x4d(%rdx),%r11
28336320Sbholler	mov    %r11,0x4d(%rcx)
28346320SbhollerL(bkP5Q9):
28356320Sbholler	mov    0x45(%rdx),%r10
28366320Sbholler	mov    %r10,0x45(%rcx)
28376320SbhollerL(bkP5Q8):
28386320Sbholler	mov    0x3d(%rdx),%r9
28396320Sbholler	mov    %r9,0x3d(%rcx)
28406320SbhollerL(bkP5Q7):
28416320Sbholler	mov    0x35(%rdx),%r11
28426320Sbholler	mov    %r11,0x35(%rcx)
28436320SbhollerL(bkP5Q6):
28446320Sbholler	mov    0x2d(%rdx),%r10
28456320Sbholler	mov    %r10,0x2d(%rcx)
28466320SbhollerL(bkP5Q5):
28476320Sbholler	mov    0x25(%rdx),%r9
28486320Sbholler	mov    %r9,0x25(%rcx)
28496320SbhollerL(bkP5Q4):
28506320Sbholler	mov    0x1d(%rdx),%r11
28516320Sbholler	mov    %r11,0x1d(%rcx)
28526320SbhollerL(bkP5Q3):
28536320Sbholler	mov    0x15(%rdx),%r10
28546320Sbholler	mov    %r10,0x15(%rcx)
28556320SbhollerL(bkP5Q2):
28566320Sbholler	mov    0xd(%rdx),%r9
28576320Sbholler	mov    %r9,0xd(%rcx)
28586320SbhollerL(bkP5Q1):
28596320Sbholler	mov    0x5(%rdx),%r11
28606320Sbholler	mov    %r11,0x5(%rcx)
28616320SbhollerL(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
28626320Sbholler	mov    0x1(%rdx),%r9d
28636320Sbholler	mov    %r9d,0x1(%rcx)
28646320Sbholler	mov    (%rdx),%r10b
28656320Sbholler	mov    %r10b,(%rcx)
28666320Sbholler	ret
28676320Sbholler
28686320Sbholler	.balign 16
28696320SbhollerL(bkP6QI):
28706320Sbholler	mov    0x8e(%rdx),%r10
28716320Sbholler	mov    %r10,0x8e(%rcx)
28726320SbhollerL(bkP6QH):
28736320Sbholler	mov    0x86(%rdx),%r11
28746320Sbholler	mov    %r11,0x86(%rcx)
28756320SbhollerL(bkP6QG):
28766320Sbholler	mov    0x7e(%rdx),%r10
28776320Sbholler	mov    %r10,0x7e(%rcx)
28786320SbhollerL(bkP6QF):
28796320Sbholler	mov    0x76(%rdx),%r9
28806320Sbholler	mov    %r9,0x76(%rcx)
28816320SbhollerL(bkP6QE):
28826320Sbholler	mov    0x6e(%rdx),%r11
28836320Sbholler	mov    %r11,0x6e(%rcx)
28846320SbhollerL(bkP6QD):
28856320Sbholler	mov    0x66(%rdx),%r10
28866320Sbholler	mov    %r10,0x66(%rcx)
28876320SbhollerL(bkP6QC):
28886320Sbholler	mov    0x5e(%rdx),%r9
28896320Sbholler	mov    %r9,0x5e(%rcx)
28906320SbhollerL(bkP6QB):
28916320Sbholler	mov    0x56(%rdx),%r11
28926320Sbholler	mov    %r11,0x56(%rcx)
28936320SbhollerL(bkP6QA):
28946320Sbholler	mov    0x4e(%rdx),%r10
28956320Sbholler	mov    %r10,0x4e(%rcx)
28966320SbhollerL(bkP6Q9):
28976320Sbholler	mov    0x46(%rdx),%r9
28986320Sbholler	mov    %r9,0x46(%rcx)
28996320SbhollerL(bkP6Q8):
29006320Sbholler	mov    0x3e(%rdx),%r11
29016320Sbholler	mov    %r11,0x3e(%rcx)
29026320SbhollerL(bkP6Q7):
29036320Sbholler	mov    0x36(%rdx),%r10
29046320Sbholler	mov    %r10,0x36(%rcx)
29056320SbhollerL(bkP6Q6):
29066320Sbholler	mov    0x2e(%rdx),%r9
29076320Sbholler	mov    %r9,0x2e(%rcx)
29086320SbhollerL(bkP6Q5):
29096320Sbholler	mov    0x26(%rdx),%r11
29106320Sbholler	mov    %r11,0x26(%rcx)
29116320SbhollerL(bkP6Q4):
29126320Sbholler	mov    0x1e(%rdx),%r10
29136320Sbholler	mov    %r10,0x1e(%rcx)
29146320SbhollerL(bkP6Q3):
29156320Sbholler	mov    0x16(%rdx),%r9
29166320Sbholler	mov    %r9,0x16(%rcx)
29176320SbhollerL(bkP6Q2):
29186320Sbholler	mov    0xe(%rdx),%r11
29196320Sbholler	mov    %r11,0xe(%rcx)
29206320SbhollerL(bkP6Q1):
29216320Sbholler	mov    0x6(%rdx),%r10
29226320Sbholler	mov    %r10,0x6(%rcx)
29236320SbhollerL(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
29246320Sbholler	mov    0x2(%rdx),%r9d
29256320Sbholler	mov    %r9d,0x2(%rcx)
29266320Sbholler	mov    (%rdx),%r10w
29276320Sbholler	mov    %r10w,(%rcx)
29286320Sbholler	ret
29296320Sbholler
29306320Sbholler	.balign 16
29316320SbhollerL(bkP7QI):
29326320Sbholler	mov    0x8f(%rdx),%r10
29336320Sbholler	mov    %r10,0x8f(%rcx)
29346320SbhollerL(bkP7QH):
29356320Sbholler	mov    0x87(%rdx),%r11
29366320Sbholler	mov    %r11,0x87(%rcx)
29376320SbhollerL(bkP7QG):
29386320Sbholler	mov    0x7f(%rdx),%r10
29396320Sbholler	mov    %r10,0x7f(%rcx)
29406320SbhollerL(bkP7QF):
29416320Sbholler	mov    0x77(%rdx),%r9
29426320Sbholler	mov    %r9,0x77(%rcx)
29436320SbhollerL(bkP7QE):
29446320Sbholler	mov    0x6f(%rdx),%r11
29456320Sbholler	mov    %r11,0x6f(%rcx)
29466320SbhollerL(bkP7QD):
29476320Sbholler	mov    0x67(%rdx),%r10
29486320Sbholler	mov    %r10,0x67(%rcx)
29496320SbhollerL(bkP7QC):
29506320Sbholler	mov    0x5f(%rdx),%r9
29516320Sbholler	mov    %r9,0x5f(%rcx)
29526320SbhollerL(bkP7QB):
29536320Sbholler	mov    0x57(%rdx),%r11
29546320Sbholler	mov    %r11,0x57(%rcx)
29556320SbhollerL(bkP7QA):
29566320Sbholler	mov    0x4f(%rdx),%r10
29576320Sbholler	mov    %r10,0x4f(%rcx)
29586320SbhollerL(bkP7Q9):
29596320Sbholler	mov    0x47(%rdx),%r9
29606320Sbholler	mov    %r9,0x47(%rcx)
29616320SbhollerL(bkP7Q8):
29626320Sbholler	mov    0x3f(%rdx),%r11
29636320Sbholler	mov    %r11,0x3f(%rcx)
29646320SbhollerL(bkP7Q7):
29656320Sbholler	mov    0x37(%rdx),%r10
29666320Sbholler	mov    %r10,0x37(%rcx)
29676320SbhollerL(bkP7Q6):
29686320Sbholler	mov    0x2f(%rdx),%r9
29696320Sbholler	mov    %r9,0x2f(%rcx)
29706320SbhollerL(bkP7Q5):
29716320Sbholler	mov    0x27(%rdx),%r11
29726320Sbholler	mov    %r11,0x27(%rcx)
29736320SbhollerL(bkP7Q4):
29746320Sbholler	mov    0x1f(%rdx),%r10
29756320Sbholler	mov    %r10,0x1f(%rcx)
29766320SbhollerL(bkP7Q3):
29776320Sbholler	mov    0x17(%rdx),%r9
29786320Sbholler	mov    %r9,0x17(%rcx)
29796320SbhollerL(bkP7Q2):
29806320Sbholler	mov    0xf(%rdx),%r11
29816320Sbholler	mov    %r11,0xf(%rcx)
29826320SbhollerL(bkP7Q1):
29836320Sbholler	mov    0x7(%rdx),%r10
29846320Sbholler	mov    %r10,0x7(%rcx)
29856320SbhollerL(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
29866320Sbholler	mov    0x3(%rdx),%r9d
29876320Sbholler	mov    %r9d,0x3(%rcx)
29886320Sbholler	mov    0x1(%rdx),%r10w
29896320Sbholler	mov    %r10w,0x1(%rcx)
29906320Sbholler	mov    (%rdx),%r11b
29916320Sbholler	mov    %r11b,(%rcx)
29926320Sbholler	ret
29936320Sbholler
29946320Sbholler		.balign 16
29956320SbhollerL(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
29966320Sbholler		.int L(bkP1Q0)-L(bkPxQx)
29976320Sbholler		.int L(bkP2Q0)-L(bkPxQx)
29986320Sbholler		.int L(bkP3Q0)-L(bkPxQx)
29996320Sbholler		.int L(bkP4Q0)-L(bkPxQx)
30006320Sbholler		.int L(bkP5Q0)-L(bkPxQx)
30016320Sbholler		.int L(bkP6Q0)-L(bkPxQx)
30026320Sbholler		.int L(bkP7Q0)-L(bkPxQx)
30036320Sbholler
30046320Sbholler		.int L(bkP0Q1)-L(bkPxQx)
30056320Sbholler		.int L(bkP1Q1)-L(bkPxQx)
30066320Sbholler		.int L(bkP2Q1)-L(bkPxQx)
30076320Sbholler		.int L(bkP3Q1)-L(bkPxQx)
30086320Sbholler		.int L(bkP4Q1)-L(bkPxQx)
30096320Sbholler		.int L(bkP5Q1)-L(bkPxQx)
30106320Sbholler		.int L(bkP6Q1)-L(bkPxQx)
30116320Sbholler		.int L(bkP7Q1)-L(bkPxQx)
30126320Sbholler
30136320Sbholler		.int L(bkP0Q2)-L(bkPxQx)
30146320Sbholler		.int L(bkP1Q2)-L(bkPxQx)
30156320Sbholler		.int L(bkP2Q2)-L(bkPxQx)
30166320Sbholler		.int L(bkP3Q2)-L(bkPxQx)
30176320Sbholler		.int L(bkP4Q2)-L(bkPxQx)
30186320Sbholler		.int L(bkP5Q2)-L(bkPxQx)
30196320Sbholler		.int L(bkP6Q2)-L(bkPxQx)
30206320Sbholler		.int L(bkP7Q2)-L(bkPxQx)
30216320Sbholler
30226320Sbholler		.int L(bkP0Q3)-L(bkPxQx)
30236320Sbholler		.int L(bkP1Q3)-L(bkPxQx)
30246320Sbholler		.int L(bkP2Q3)-L(bkPxQx)
30256320Sbholler		.int L(bkP3Q3)-L(bkPxQx)
30266320Sbholler		.int L(bkP4Q3)-L(bkPxQx)
30276320Sbholler		.int L(bkP5Q3)-L(bkPxQx)
30286320Sbholler		.int L(bkP6Q3)-L(bkPxQx)
30296320Sbholler		.int L(bkP7Q3)-L(bkPxQx)
30306320Sbholler
30316320Sbholler		.int L(bkP0Q4)-L(bkPxQx)
30326320Sbholler		.int L(bkP1Q4)-L(bkPxQx)
30336320Sbholler		.int L(bkP2Q4)-L(bkPxQx)
30346320Sbholler		.int L(bkP3Q4)-L(bkPxQx)
30356320Sbholler		.int L(bkP4Q4)-L(bkPxQx)
30366320Sbholler		.int L(bkP5Q4)-L(bkPxQx)
30376320Sbholler		.int L(bkP6Q4)-L(bkPxQx)
30386320Sbholler		.int L(bkP7Q4)-L(bkPxQx)
30396320Sbholler
30406320Sbholler		.int L(bkP0Q5)-L(bkPxQx)
30416320Sbholler		.int L(bkP1Q5)-L(bkPxQx)
30426320Sbholler		.int L(bkP2Q5)-L(bkPxQx)
30436320Sbholler		.int L(bkP3Q5)-L(bkPxQx)
30446320Sbholler		.int L(bkP4Q5)-L(bkPxQx)
30456320Sbholler		.int L(bkP5Q5)-L(bkPxQx)
30466320Sbholler		.int L(bkP6Q5)-L(bkPxQx)
30476320Sbholler		.int L(bkP7Q5)-L(bkPxQx)
30486320Sbholler
30496320Sbholler		.int L(bkP0Q6)-L(bkPxQx)
30506320Sbholler		.int L(bkP1Q6)-L(bkPxQx)
30516320Sbholler		.int L(bkP2Q6)-L(bkPxQx)
30526320Sbholler		.int L(bkP3Q6)-L(bkPxQx)
30536320Sbholler		.int L(bkP4Q6)-L(bkPxQx)
30546320Sbholler		.int L(bkP5Q6)-L(bkPxQx)
30556320Sbholler		.int L(bkP6Q6)-L(bkPxQx)
30566320Sbholler		.int L(bkP7Q6)-L(bkPxQx)
30576320Sbholler
30586320Sbholler		.int L(bkP0Q7)-L(bkPxQx)
30596320Sbholler		.int L(bkP1Q7)-L(bkPxQx)
30606320Sbholler		.int L(bkP2Q7)-L(bkPxQx)
30616320Sbholler		.int L(bkP3Q7)-L(bkPxQx)
30626320Sbholler		.int L(bkP4Q7)-L(bkPxQx)
30636320Sbholler		.int L(bkP5Q7)-L(bkPxQx)
30646320Sbholler		.int L(bkP6Q7)-L(bkPxQx)
30656320Sbholler		.int L(bkP7Q7)-L(bkPxQx)
30666320Sbholler
30676320Sbholler		.int L(bkP0Q8)-L(bkPxQx)
30686320Sbholler		.int L(bkP1Q8)-L(bkPxQx)
30696320Sbholler		.int L(bkP2Q8)-L(bkPxQx)
30706320Sbholler		.int L(bkP3Q8)-L(bkPxQx)
30716320Sbholler		.int L(bkP4Q8)-L(bkPxQx)
30726320Sbholler		.int L(bkP5Q8)-L(bkPxQx)
30736320Sbholler		.int L(bkP6Q8)-L(bkPxQx)
30746320Sbholler		.int L(bkP7Q8)-L(bkPxQx)
30756320Sbholler
30766320Sbholler		.int L(bkP0Q9)-L(bkPxQx)
30776320Sbholler		.int L(bkP1Q9)-L(bkPxQx)
30786320Sbholler		.int L(bkP2Q9)-L(bkPxQx)
30796320Sbholler		.int L(bkP3Q9)-L(bkPxQx)
30806320Sbholler		.int L(bkP4Q9)-L(bkPxQx)
30816320Sbholler		.int L(bkP5Q9)-L(bkPxQx)
30826320Sbholler		.int L(bkP6Q9)-L(bkPxQx)
30836320Sbholler		.int L(bkP7Q9)-L(bkPxQx)
30846320Sbholler
30856320Sbholler		.int L(bkP0QA)-L(bkPxQx)
30866320Sbholler		.int L(bkP1QA)-L(bkPxQx)
30876320Sbholler		.int L(bkP2QA)-L(bkPxQx)
30886320Sbholler		.int L(bkP3QA)-L(bkPxQx)
30896320Sbholler		.int L(bkP4QA)-L(bkPxQx)
30906320Sbholler		.int L(bkP5QA)-L(bkPxQx)
30916320Sbholler		.int L(bkP6QA)-L(bkPxQx)
30926320Sbholler		.int L(bkP7QA)-L(bkPxQx)
30936320Sbholler
30946320Sbholler		.int L(bkP0QB)-L(bkPxQx)
30956320Sbholler		.int L(bkP1QB)-L(bkPxQx)
30966320Sbholler		.int L(bkP2QB)-L(bkPxQx)
30976320Sbholler		.int L(bkP3QB)-L(bkPxQx)
30986320Sbholler		.int L(bkP4QB)-L(bkPxQx)
30996320Sbholler		.int L(bkP5QB)-L(bkPxQx)
31006320Sbholler		.int L(bkP6QB)-L(bkPxQx)
31016320Sbholler		.int L(bkP7QB)-L(bkPxQx)
31026320Sbholler
31036320Sbholler		.int L(bkP0QC)-L(bkPxQx)
31046320Sbholler		.int L(bkP1QC)-L(bkPxQx)
31056320Sbholler		.int L(bkP2QC)-L(bkPxQx)
31066320Sbholler		.int L(bkP3QC)-L(bkPxQx)
31076320Sbholler		.int L(bkP4QC)-L(bkPxQx)
31086320Sbholler		.int L(bkP5QC)-L(bkPxQx)
31096320Sbholler		.int L(bkP6QC)-L(bkPxQx)
31106320Sbholler		.int L(bkP7QC)-L(bkPxQx)
31116320Sbholler
31126320Sbholler		.int L(bkP0QD)-L(bkPxQx)
31136320Sbholler		.int L(bkP1QD)-L(bkPxQx)
31146320Sbholler		.int L(bkP2QD)-L(bkPxQx)
31156320Sbholler		.int L(bkP3QD)-L(bkPxQx)
31166320Sbholler		.int L(bkP4QD)-L(bkPxQx)
31176320Sbholler		.int L(bkP5QD)-L(bkPxQx)
31186320Sbholler		.int L(bkP6QD)-L(bkPxQx)
31196320Sbholler		.int L(bkP7QD)-L(bkPxQx)
31206320Sbholler
31216320Sbholler		.int L(bkP0QE)-L(bkPxQx)
31226320Sbholler		.int L(bkP1QE)-L(bkPxQx)
31236320Sbholler		.int L(bkP2QE)-L(bkPxQx)
31246320Sbholler		.int L(bkP3QE)-L(bkPxQx)
31256320Sbholler		.int L(bkP4QE)-L(bkPxQx)
31266320Sbholler		.int L(bkP5QE)-L(bkPxQx)
31276320Sbholler		.int L(bkP6QE)-L(bkPxQx)
31286320Sbholler		.int L(bkP7QE)-L(bkPxQx)
31296320Sbholler
31306320Sbholler		.int L(bkP0QF)-L(bkPxQx)
31316320Sbholler		.int L(bkP1QF)-L(bkPxQx)
31326320Sbholler		.int L(bkP2QF)-L(bkPxQx)
31336320Sbholler		.int L(bkP3QF)-L(bkPxQx)
31346320Sbholler		.int L(bkP4QF)-L(bkPxQx)
31356320Sbholler		.int L(bkP5QF)-L(bkPxQx)
31366320Sbholler		.int L(bkP6QF)-L(bkPxQx)
31376320Sbholler		.int L(bkP7QF)-L(bkPxQx)
31386320Sbholler
31396320Sbholler		.int L(bkP0QG)-L(bkPxQx)
31406320Sbholler		.int L(bkP1QG)-L(bkPxQx)
31416320Sbholler		.int L(bkP2QG)-L(bkPxQx)
31426320Sbholler		.int L(bkP3QG)-L(bkPxQx)
31436320Sbholler		.int L(bkP4QG)-L(bkPxQx)
31446320Sbholler		.int L(bkP5QG)-L(bkPxQx)
31456320Sbholler		.int L(bkP6QG)-L(bkPxQx)
31466320Sbholler		.int L(bkP7QG)-L(bkPxQx)
31476320Sbholler
31486320Sbholler		.int L(bkP0QH)-L(bkPxQx)
31496320Sbholler		.int L(bkP1QH)-L(bkPxQx)
31506320Sbholler		.int L(bkP2QH)-L(bkPxQx)
31516320Sbholler		.int L(bkP3QH)-L(bkPxQx)
31526320Sbholler		.int L(bkP4QH)-L(bkPxQx)
31536320Sbholler		.int L(bkP5QH)-L(bkPxQx)
31546320Sbholler		.int L(bkP6QH)-L(bkPxQx)
31556320Sbholler		.int L(bkP7QH)-L(bkPxQx)
31566320Sbholler
31576320Sbholler		.int L(bkP0QI)-L(bkPxQx)
31586320Sbholler		.int L(bkP1QI)-L(bkPxQx)
31596320Sbholler		.int L(bkP2QI)-L(bkPxQx)
31606320Sbholler		.int L(bkP3QI)-L(bkPxQx)
31616320Sbholler		.int L(bkP4QI)-L(bkPxQx)
31626320Sbholler		.int L(bkP5QI)-L(bkPxQx)
31636320Sbholler		.int L(bkP6QI)-L(bkPxQx)
31646320Sbholler		.int L(bkP7QI)-L(bkPxQx)
31656320Sbholler
31660Sstevel@tonic-gate	SET_SIZE(memmove)
3167