/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies two blocks of memory
 *	Implements memcpy() and memmove() libc primitives.
356320Sbholler */ 366812Sraf 37*7298SMark.J.Nelson@Sun.COM .file "memcpy.s" 380Sstevel@tonic-gate 390Sstevel@tonic-gate#include <sys/asm_linkage.h> 406812Sraf 410Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memmove,function) 420Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memcpy,function) 430Sstevel@tonic-gate 440Sstevel@tonic-gate#include "cache.h" 456320Sbholler#include "proc64_id.h" 460Sstevel@tonic-gate 476320Sbholler#define L(s) .memcpy/**/s 486320Sbholler 496320Sbholler/* 506320Sbholler * memcpy algorithm overview: 516320Sbholler * 526320Sbholler * Thresholds used below were determined experimentally. 536320Sbholler * 546320Sbholler * Pseudo code: 556320Sbholler * 566320Sbholler * If (size <= 128 bytes) { 576320Sbholler * do unrolled code (primarily 8-byte loads/stores) regardless of 586320Sbholler * alignment. 596320Sbholler * } else { 606320Sbholler * Align destination to 16-byte boundary 616320Sbholler * 626320Sbholler * if (NO_SSE) { 636320Sbholler * If (size > half of the largest level cache) { 646320Sbholler * Use 8-byte non-temporal stores (64-bytes/loop) 656320Sbholler * } else { 666320Sbholler * if (size > 4K && size <= half l1 cache size) { 676320Sbholler * Use rep movsq 686320Sbholler * } else { 696320Sbholler * Use 8-byte loads/stores (64 bytes per loop) 706320Sbholler * } 716320Sbholler * } 726320Sbholler * 736320Sbholler * } else { **USE SSE** 746320Sbholler * If (size > half of the largest level cache) { 756320Sbholler * Use 16-byte non-temporal stores (128-bytes per loop) 766320Sbholler * } else { 776320Sbholler * If (both source and destination are aligned) { 786320Sbholler * Use 16-byte aligned loads and stores (128 bytes/loop) 796320Sbholler * } else { 806320Sbholler * use pairs of xmm registers with SSE2 or SSSE3 816320Sbholler * instructions to concatenate and shift appropriately 826320Sbholler * to account for source unalignment. This enables 836320Sbholler * 16-byte aligned loads to be done. 
846320Sbholler * } 856320Sbholler * } 866320Sbholler } 876320Sbholler * 886320Sbholler * Finish any remaining bytes via unrolled code above. 896320Sbholler * } 906320Sbholler * 916320Sbholler * memmove overview: 926320Sbholler * memmove is the same as memcpy except one case where copy needs to be 936320Sbholler * done backwards. The copy backwards code is done in a similar manner. 946320Sbholler */ 956320Sbholler 966320Sbholler ENTRY(memmove) 976320Sbholler cmp %rsi,%rdi # if dst <= src 986320Sbholler jbe L(CopyForward) # then do copy forward 996320Sbholler mov %rsi,%r9 # move src to r9 1006320Sbholler add %rdx,%r9 # add len to get addr of end of src 1016320Sbholler cmp %r9,%rdi # if dst < end of src 1026320Sbholler jb L(CopyBackwards) # then do copy backwards 1036320Sbholler jmp L(CopyForward) 1046320Sbholler 1056320Sbholler ENTRY (memcpy) 1066320SbhollerL(CopyForward): 1076320Sbholler mov %rdx,%r8 1086320Sbholler mov %rdi,%rcx 1096320Sbholler mov %rsi,%rdx 1106320Sbholler mov %rdi,%rax 1116320Sbholler lea L(fwdPxQx)(%rip),%r11 1126320Sbholler cmp $0x80,%r8 # 128 1136320Sbholler jg L(ck_use_sse2) 1146320Sbholler add %r8,%rcx 1156320Sbholler add %r8,%rdx 1166320Sbholler 1176320Sbholler movslq (%r11,%r8,4),%r10 1186320Sbholler lea (%r10,%r11,1),%r11 1196320Sbholler jmpq *%r11 1206320Sbholler 1216320Sbholler .balign 16 1226320SbhollerL(ShrtAlignNew): 1236320Sbholler lea L(AliPxQx)(%rip),%r11 1246320Sbholler mov %rcx,%r9 1256320Sbholler and $0xf,%r9 1266320Sbholler 1276320Sbholler movslq (%r11,%r9,4),%r10 1286320Sbholler lea (%r10,%r11,1),%r11 1296320Sbholler jmpq *%r11 1306320Sbholler 1316320Sbholler .balign 16 1326320SbhollerL(fwdPxQx): .int L(P0Q0)-L(fwdPxQx) 1336320Sbholler .int L(P1Q0)-L(fwdPxQx) 1346320Sbholler .int L(P2Q0)-L(fwdPxQx) 1356320Sbholler .int L(P3Q0)-L(fwdPxQx) 1366320Sbholler .int L(P4Q0)-L(fwdPxQx) 1376320Sbholler .int L(P5Q0)-L(fwdPxQx) 1386320Sbholler .int L(P6Q0)-L(fwdPxQx) 1396320Sbholler .int L(P7Q0)-L(fwdPxQx) 1406320Sbholler 1416320Sbholler 
.int L(P0Q1)-L(fwdPxQx) 1426320Sbholler .int L(P1Q1)-L(fwdPxQx) 1436320Sbholler .int L(P2Q1)-L(fwdPxQx) 1446320Sbholler .int L(P3Q1)-L(fwdPxQx) 1456320Sbholler .int L(P4Q1)-L(fwdPxQx) 1466320Sbholler .int L(P5Q1)-L(fwdPxQx) 1476320Sbholler .int L(P6Q1)-L(fwdPxQx) 1486320Sbholler .int L(P7Q1)-L(fwdPxQx) 1496320Sbholler 1506320Sbholler .int L(P0Q2)-L(fwdPxQx) 1516320Sbholler .int L(P1Q2)-L(fwdPxQx) 1526320Sbholler .int L(P2Q2)-L(fwdPxQx) 1536320Sbholler .int L(P3Q2)-L(fwdPxQx) 1546320Sbholler .int L(P4Q2)-L(fwdPxQx) 1556320Sbholler .int L(P5Q2)-L(fwdPxQx) 1566320Sbholler .int L(P6Q2)-L(fwdPxQx) 1576320Sbholler .int L(P7Q2)-L(fwdPxQx) 1586320Sbholler 1596320Sbholler .int L(P0Q3)-L(fwdPxQx) 1606320Sbholler .int L(P1Q3)-L(fwdPxQx) 1616320Sbholler .int L(P2Q3)-L(fwdPxQx) 1626320Sbholler .int L(P3Q3)-L(fwdPxQx) 1636320Sbholler .int L(P4Q3)-L(fwdPxQx) 1646320Sbholler .int L(P5Q3)-L(fwdPxQx) 1656320Sbholler .int L(P6Q3)-L(fwdPxQx) 1666320Sbholler .int L(P7Q3)-L(fwdPxQx) 1676320Sbholler 1686320Sbholler .int L(P0Q4)-L(fwdPxQx) 1696320Sbholler .int L(P1Q4)-L(fwdPxQx) 1706320Sbholler .int L(P2Q4)-L(fwdPxQx) 1716320Sbholler .int L(P3Q4)-L(fwdPxQx) 1726320Sbholler .int L(P4Q4)-L(fwdPxQx) 1736320Sbholler .int L(P5Q4)-L(fwdPxQx) 1746320Sbholler .int L(P6Q4)-L(fwdPxQx) 1756320Sbholler .int L(P7Q4)-L(fwdPxQx) 1766320Sbholler 1776320Sbholler .int L(P0Q5)-L(fwdPxQx) 1786320Sbholler .int L(P1Q5)-L(fwdPxQx) 1796320Sbholler .int L(P2Q5)-L(fwdPxQx) 1806320Sbholler .int L(P3Q5)-L(fwdPxQx) 1816320Sbholler .int L(P4Q5)-L(fwdPxQx) 1826320Sbholler .int L(P5Q5)-L(fwdPxQx) 1836320Sbholler .int L(P6Q5)-L(fwdPxQx) 1846320Sbholler .int L(P7Q5)-L(fwdPxQx) 1856320Sbholler 1866320Sbholler .int L(P0Q6)-L(fwdPxQx) 1876320Sbholler .int L(P1Q6)-L(fwdPxQx) 1886320Sbholler .int L(P2Q6)-L(fwdPxQx) 1896320Sbholler .int L(P3Q6)-L(fwdPxQx) 1906320Sbholler .int L(P4Q6)-L(fwdPxQx) 1916320Sbholler .int L(P5Q6)-L(fwdPxQx) 1926320Sbholler .int L(P6Q6)-L(fwdPxQx) 1936320Sbholler .int L(P7Q6)-L(fwdPxQx) 1946320Sbholler 
1956320Sbholler .int L(P0Q7)-L(fwdPxQx) 1966320Sbholler .int L(P1Q7)-L(fwdPxQx) 1976320Sbholler .int L(P2Q7)-L(fwdPxQx) 1986320Sbholler .int L(P3Q7)-L(fwdPxQx) 1996320Sbholler .int L(P4Q7)-L(fwdPxQx) 2006320Sbholler .int L(P5Q7)-L(fwdPxQx) 2016320Sbholler .int L(P6Q7)-L(fwdPxQx) 2026320Sbholler .int L(P7Q7)-L(fwdPxQx) 2036320Sbholler 2046320Sbholler .int L(P0Q8)-L(fwdPxQx) 2056320Sbholler .int L(P1Q8)-L(fwdPxQx) 2066320Sbholler .int L(P2Q8)-L(fwdPxQx) 2076320Sbholler .int L(P3Q8)-L(fwdPxQx) 2086320Sbholler .int L(P4Q8)-L(fwdPxQx) 2096320Sbholler .int L(P5Q8)-L(fwdPxQx) 2106320Sbholler .int L(P6Q8)-L(fwdPxQx) 2116320Sbholler .int L(P7Q8)-L(fwdPxQx) 2126320Sbholler 2136320Sbholler .int L(P0Q9)-L(fwdPxQx) 2146320Sbholler .int L(P1Q9)-L(fwdPxQx) 2156320Sbholler .int L(P2Q9)-L(fwdPxQx) 2166320Sbholler .int L(P3Q9)-L(fwdPxQx) 2176320Sbholler .int L(P4Q9)-L(fwdPxQx) 2186320Sbholler .int L(P5Q9)-L(fwdPxQx) 2196320Sbholler .int L(P6Q9)-L(fwdPxQx) 2206320Sbholler .int L(P7Q9)-L(fwdPxQx) 2216320Sbholler 2226320Sbholler .int L(P0QA)-L(fwdPxQx) 2236320Sbholler .int L(P1QA)-L(fwdPxQx) 2246320Sbholler .int L(P2QA)-L(fwdPxQx) 2256320Sbholler .int L(P3QA)-L(fwdPxQx) 2266320Sbholler .int L(P4QA)-L(fwdPxQx) 2276320Sbholler .int L(P5QA)-L(fwdPxQx) 2286320Sbholler .int L(P6QA)-L(fwdPxQx) 2296320Sbholler .int L(P7QA)-L(fwdPxQx) 2306320Sbholler 2316320Sbholler .int L(P0QB)-L(fwdPxQx) 2326320Sbholler .int L(P1QB)-L(fwdPxQx) 2336320Sbholler .int L(P2QB)-L(fwdPxQx) 2346320Sbholler .int L(P3QB)-L(fwdPxQx) 2356320Sbholler .int L(P4QB)-L(fwdPxQx) 2366320Sbholler .int L(P5QB)-L(fwdPxQx) 2376320Sbholler .int L(P6QB)-L(fwdPxQx) 2386320Sbholler .int L(P7QB)-L(fwdPxQx) 2396320Sbholler 2406320Sbholler .int L(P0QC)-L(fwdPxQx) 2416320Sbholler .int L(P1QC)-L(fwdPxQx) 2426320Sbholler .int L(P2QC)-L(fwdPxQx) 2436320Sbholler .int L(P3QC)-L(fwdPxQx) 2446320Sbholler .int L(P4QC)-L(fwdPxQx) 2456320Sbholler .int L(P5QC)-L(fwdPxQx) 2466320Sbholler .int L(P6QC)-L(fwdPxQx) 2476320Sbholler .int L(P7QC)-L(fwdPxQx) 
2486320Sbholler 2496320Sbholler .int L(P0QD)-L(fwdPxQx) 2506320Sbholler .int L(P1QD)-L(fwdPxQx) 2516320Sbholler .int L(P2QD)-L(fwdPxQx) 2526320Sbholler .int L(P3QD)-L(fwdPxQx) 2536320Sbholler .int L(P4QD)-L(fwdPxQx) 2546320Sbholler .int L(P5QD)-L(fwdPxQx) 2556320Sbholler .int L(P6QD)-L(fwdPxQx) 2566320Sbholler .int L(P7QD)-L(fwdPxQx) 2576320Sbholler 2586320Sbholler .int L(P0QE)-L(fwdPxQx) 2596320Sbholler .int L(P1QE)-L(fwdPxQx) 2606320Sbholler .int L(P2QE)-L(fwdPxQx) 2616320Sbholler .int L(P3QE)-L(fwdPxQx) 2626320Sbholler .int L(P4QE)-L(fwdPxQx) 2636320Sbholler .int L(P5QE)-L(fwdPxQx) 2646320Sbholler .int L(P6QE)-L(fwdPxQx) 2656320Sbholler .int L(P7QE)-L(fwdPxQx) 2666320Sbholler 2676320Sbholler .int L(P0QF)-L(fwdPxQx) 2686320Sbholler .int L(P1QF)-L(fwdPxQx) 2696320Sbholler .int L(P2QF)-L(fwdPxQx) 2706320Sbholler .int L(P3QF)-L(fwdPxQx) 2716320Sbholler .int L(P4QF)-L(fwdPxQx) 2726320Sbholler .int L(P5QF)-L(fwdPxQx) 2736320Sbholler .int L(P6QF)-L(fwdPxQx) 2746320Sbholler .int L(P7QF)-L(fwdPxQx) 2756320Sbholler 2766320Sbholler .int L(P0QG)-L(fwdPxQx) # 0x80 2776320Sbholler 2786320Sbholler .balign 16 2796320SbhollerL(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 2806320Sbholler .int L(A1Q0)-L(AliPxQx) 2816320Sbholler .int L(A2Q0)-L(AliPxQx) 2826320Sbholler .int L(A3Q0)-L(AliPxQx) 2836320Sbholler .int L(A4Q0)-L(AliPxQx) 2846320Sbholler .int L(A5Q0)-L(AliPxQx) 2856320Sbholler .int L(A6Q0)-L(AliPxQx) 2866320Sbholler .int L(A7Q0)-L(AliPxQx) 2876320Sbholler .int L(A0Q1)-L(AliPxQx) 2886320Sbholler .int L(A1Q1)-L(AliPxQx) 2896320Sbholler .int L(A2Q1)-L(AliPxQx) 2906320Sbholler .int L(A3Q1)-L(AliPxQx) 2916320Sbholler .int L(A4Q1)-L(AliPxQx) 2926320Sbholler .int L(A5Q1)-L(AliPxQx) 2936320Sbholler .int L(A6Q1)-L(AliPxQx) 2946320Sbholler .int L(A7Q1)-L(AliPxQx) 2956320Sbholler 2966320Sbholler .balign 16 2976320SbhollerL(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 2986320Sbholler movzbq (%rdx),%r11 2996320Sbholler sub $0xf,%r8 3006320Sbholler mov %r11b,(%rcx) 3016320Sbholler 
3026320Sbholler movzwq 0x1(%rdx),%r10 3036320Sbholler mov %r10w,0x1(%rcx) 3046320Sbholler 3056320Sbholler mov 0x3(%rdx),%r9d 3066320Sbholler mov %r9d,0x3(%rcx) 3076320Sbholler 3086320Sbholler mov 0x7(%rdx),%r11 3096320Sbholler add $0xf,%rdx 3106320Sbholler mov %r11,0x7(%rcx) 3116320Sbholler 3126320Sbholler add $0xf,%rcx 3136320Sbholler jmp L(now_qw_aligned) 3146320Sbholler 3156320Sbholler .balign 16 3166320SbhollerL(A2Q0): # ; need to move 8+ 6=2+4 bytes 3176320Sbholler movzwq (%rdx),%r10 3186320Sbholler sub $0xe,%r8 3196320Sbholler mov %r10w,(%rcx) 3206320Sbholler 3216320Sbholler mov 0x2(%rdx),%r9d 3226320Sbholler mov %r9d,0x2(%rcx) 3236320Sbholler 3246320Sbholler mov 0x6(%rdx),%r11 3256320Sbholler add $0xe,%rdx 3266320Sbholler mov %r11,0x6(%rcx) 3276320Sbholler add $0xe,%rcx 3286320Sbholler jmp L(now_qw_aligned) 3296320Sbholler 3306320Sbholler .balign 16 3316320SbhollerL(A3Q0): # ; need to move 8+ 5=1+4 bytes 3326320Sbholler movzbq (%rdx),%r11 3336320Sbholler sub $0xd,%r8 3346320Sbholler mov %r11b,(%rcx) 3356320Sbholler 3366320Sbholler mov 0x1(%rdx),%r9d 3376320Sbholler mov %r9d,0x1(%rcx) 3386320Sbholler 3396320Sbholler mov 0x5(%rdx),%r10 3406320Sbholler add $0xd,%rdx 3416320Sbholler mov %r10,0x5(%rcx) 3426320Sbholler 3436320Sbholler add $0xd,%rcx 3446320Sbholler jmp L(now_qw_aligned) 3456320Sbholler 3466320Sbholler .balign 16 3476320SbhollerL(A4Q0): # ; need to move 8+4 bytes 3486320Sbholler mov (%rdx),%r9d 3496320Sbholler sub $0xc,%r8 3506320Sbholler mov %r9d,(%rcx) 3516320Sbholler 3526320Sbholler mov 0x4(%rdx),%r10 3536320Sbholler add $0xc,%rdx 3546320Sbholler mov %r10,0x4(%rcx) 3556320Sbholler 3566320Sbholler add $0xc,%rcx 3576320Sbholler jmp L(now_qw_aligned) 3586320Sbholler 3596320Sbholler .balign 16 3606320SbhollerL(A5Q0): # ; need to move 8+ 3=1+2 bytes 3616320Sbholler movzbq (%rdx),%r11 3626320Sbholler sub $0xb,%r8 3636320Sbholler mov %r11b,(%rcx) 3646320Sbholler 3656320Sbholler movzwq 0x1(%rdx),%r10 3666320Sbholler mov %r10w,0x1(%rcx) 3676320Sbholler 
3686320Sbholler mov 0x3(%rdx),%r9 3696320Sbholler add $0xb,%rdx 3706320Sbholler mov %r9,0x3(%rcx) 3716320Sbholler 3726320Sbholler add $0xb,%rcx 3736320Sbholler jmp L(now_qw_aligned) 3746320Sbholler 3756320Sbholler .balign 16 3766320SbhollerL(A6Q0): # ; need to move 8+2 bytes 3776320Sbholler movzwq (%rdx),%r10 3786320Sbholler sub $0xa,%r8 3796320Sbholler mov %r10w,(%rcx) 3806320Sbholler 3816320Sbholler mov 0x2(%rdx),%r9 3826320Sbholler add $0xa,%rdx 3836320Sbholler mov %r9,0x2(%rcx) 3846320Sbholler 3856320Sbholler add $0xa,%rcx 3866320Sbholler jmp L(now_qw_aligned) 3876320Sbholler 3886320Sbholler .balign 16 3896320SbhollerL(A7Q0): # ; need to move 8+1 byte 3906320Sbholler movzbq (%rdx),%r11 3916320Sbholler sub $0x9,%r8 3926320Sbholler mov %r11b,(%rcx) 3936320Sbholler 3946320Sbholler mov 0x1(%rdx),%r10 3956320Sbholler add $0x9,%rdx 3966320Sbholler mov %r10,0x1(%rcx) 3976320Sbholler 3986320Sbholler add $0x9,%rcx 3996320Sbholler jmp L(now_qw_aligned) 4006320Sbholler 4016320Sbholler .balign 16 4026320SbhollerL(A0Q1): # ; need to move 8 bytes 4036320Sbholler 4046320Sbholler mov (%rdx),%r10 4056320Sbholler add $0x8,%rdx 4066320Sbholler sub $0x8,%r8 4076320Sbholler mov %r10,(%rcx) 4086320Sbholler 4096320Sbholler add $0x8,%rcx 4106320Sbholler jmp L(now_qw_aligned) 4116320Sbholler 4126320Sbholler .balign 16 4136320SbhollerL(A1Q1): # ; need to move 7=1+2+4 bytes 4146320Sbholler movzbq (%rdx),%r11 4156320Sbholler sub $0x7,%r8 4166320Sbholler mov %r11b,(%rcx) 4176320Sbholler 4186320Sbholler movzwq 0x1(%rdx),%r10 4196320Sbholler mov %r10w,0x1(%rcx) 4206320Sbholler 4216320Sbholler mov 0x3(%rdx),%r9d 4226320Sbholler add $0x7,%rdx 4236320Sbholler mov %r9d,0x3(%rcx) 4246320Sbholler add $0x7,%rcx 4256320Sbholler jmp L(now_qw_aligned) 4266320Sbholler 4276320Sbholler .balign 16 4286320SbhollerL(A2Q1): # ; need to move 6=2+4 bytes 4296320Sbholler movzwq (%rdx),%r10 4306320Sbholler sub $0x6,%r8 4316320Sbholler mov %r10w,(%rcx) 4326320Sbholler mov 0x2(%rdx),%r9d 4336320Sbholler add 
$0x6,%rdx 4346320Sbholler mov %r9d,0x2(%rcx) 4356320Sbholler add $0x6,%rcx 4366320Sbholler jmp L(now_qw_aligned) 4376320Sbholler 4386320Sbholler .balign 16 4396320SbhollerL(A3Q1): # ; need to move 5=1+4 bytes 4406320Sbholler movzbq (%rdx),%r11 4416320Sbholler sub $0x5,%r8 4426320Sbholler mov %r11b,(%rcx) 4436320Sbholler mov 0x1(%rdx),%r9d 4446320Sbholler add $0x5,%rdx 4456320Sbholler mov %r9d,0x1(%rcx) 4466320Sbholler add $0x5,%rcx 4476320Sbholler jmp L(now_qw_aligned) 4486320Sbholler 4496320Sbholler .balign 16 4506320SbhollerL(A4Q1): # ; need to move 4 bytes 4516320Sbholler mov (%rdx),%r9d 4526320Sbholler sub $0x4,%r8 4536320Sbholler add $0x4,%rdx 4546320Sbholler mov %r9d,(%rcx) 4556320Sbholler add $0x4,%rcx 4566320Sbholler jmp L(now_qw_aligned) 4576320Sbholler 4586320Sbholler .balign 16 4596320SbhollerL(A5Q1): # ; need to move 3=1+2 bytes 4606320Sbholler movzbq (%rdx),%r11 4616320Sbholler sub $0x3,%r8 4626320Sbholler mov %r11b,(%rcx) 4636320Sbholler 4646320Sbholler movzwq 0x1(%rdx),%r10 4656320Sbholler add $0x3,%rdx 4666320Sbholler mov %r10w,0x1(%rcx) 4676320Sbholler 4686320Sbholler add $0x3,%rcx 4696320Sbholler jmp L(now_qw_aligned) 4706320Sbholler 4716320Sbholler .balign 16 4726320SbhollerL(A6Q1): # ; need to move 2 bytes 4736320Sbholler movzwq (%rdx),%r10 4746320Sbholler sub $0x2,%r8 4756320Sbholler add $0x2,%rdx 4766320Sbholler mov %r10w,(%rcx) 4776320Sbholler add $0x2,%rcx 4786320Sbholler jmp L(now_qw_aligned) 4796320Sbholler 4806320Sbholler .balign 16 4816320SbhollerL(A7Q1): # ; need to move 1 byte 4826320Sbholler movzbq (%rdx),%r11 4836320Sbholler dec %r8 4846320Sbholler inc %rdx 4856320Sbholler mov %r11b,(%rcx) 4866320Sbholler inc %rcx 4876320Sbholler jmp L(now_qw_aligned) 4886320Sbholler 4896320Sbholler 4906320Sbholler .balign 16 4916320SbhollerL(P0QG): 4926320Sbholler mov -0x80(%rdx),%r9 4936320Sbholler mov %r9,-0x80(%rcx) 4946320SbhollerL(P0QF): 4956320Sbholler mov -0x78(%rdx),%r10 4966320Sbholler mov %r10,-0x78(%rcx) 4976320SbhollerL(P0QE): 
4986320Sbholler mov -0x70(%rdx),%r9 4996320Sbholler mov %r9,-0x70(%rcx) 5006320SbhollerL(P0QD): 5016320Sbholler mov -0x68(%rdx),%r10 5026320Sbholler mov %r10,-0x68(%rcx) 5036320SbhollerL(P0QC): 5046320Sbholler mov -0x60(%rdx),%r9 5056320Sbholler mov %r9,-0x60(%rcx) 5066320SbhollerL(P0QB): 5076320Sbholler mov -0x58(%rdx),%r10 5086320Sbholler mov %r10,-0x58(%rcx) 5096320SbhollerL(P0QA): 5106320Sbholler mov -0x50(%rdx),%r9 5116320Sbholler mov %r9,-0x50(%rcx) 5126320SbhollerL(P0Q9): 5136320Sbholler mov -0x48(%rdx),%r10 5146320Sbholler mov %r10,-0x48(%rcx) 5156320SbhollerL(P0Q8): 5166320Sbholler mov -0x40(%rdx),%r9 5176320Sbholler mov %r9,-0x40(%rcx) 5186320SbhollerL(P0Q7): 5196320Sbholler mov -0x38(%rdx),%r10 5206320Sbholler mov %r10,-0x38(%rcx) 5216320SbhollerL(P0Q6): 5226320Sbholler mov -0x30(%rdx),%r9 5236320Sbholler mov %r9,-0x30(%rcx) 5246320SbhollerL(P0Q5): 5256320Sbholler mov -0x28(%rdx),%r10 5266320Sbholler mov %r10,-0x28(%rcx) 5276320SbhollerL(P0Q4): 5286320Sbholler mov -0x20(%rdx),%r9 5296320Sbholler mov %r9,-0x20(%rcx) 5306320SbhollerL(P0Q3): 5316320Sbholler mov -0x18(%rdx),%r10 5326320Sbholler mov %r10,-0x18(%rcx) 5336320SbhollerL(P0Q2): 5346320Sbholler mov -0x10(%rdx),%r9 5356320Sbholler mov %r9,-0x10(%rcx) 5366320SbhollerL(P0Q1): 5376320Sbholler mov -0x8(%rdx),%r10 5386320Sbholler mov %r10,-0x8(%rcx) 5396320SbhollerL(P0Q0): 5406320Sbholler ret 5416320Sbholler 5426320Sbholler .balign 16 5436320SbhollerL(P1QF): 5446320Sbholler mov -0x79(%rdx),%r9 5456320Sbholler mov %r9,-0x79(%rcx) 5466320SbhollerL(P1QE): 5476320Sbholler mov -0x71(%rdx),%r11 5486320Sbholler mov %r11,-0x71(%rcx) 5496320SbhollerL(P1QD): 5506320Sbholler mov -0x69(%rdx),%r10 5516320Sbholler mov %r10,-0x69(%rcx) 5526320SbhollerL(P1QC): 5536320Sbholler mov -0x61(%rdx),%r9 5546320Sbholler mov %r9,-0x61(%rcx) 5556320SbhollerL(P1QB): 5566320Sbholler mov -0x59(%rdx),%r11 5576320Sbholler mov %r11,-0x59(%rcx) 5586320SbhollerL(P1QA): 5596320Sbholler mov -0x51(%rdx),%r10 5606320Sbholler mov 
%r10,-0x51(%rcx) 5616320SbhollerL(P1Q9): 5626320Sbholler mov -0x49(%rdx),%r9 5636320Sbholler mov %r9,-0x49(%rcx) 5646320SbhollerL(P1Q8): 5656320Sbholler mov -0x41(%rdx),%r11 5666320Sbholler mov %r11,-0x41(%rcx) 5676320SbhollerL(P1Q7): 5686320Sbholler mov -0x39(%rdx),%r10 5696320Sbholler mov %r10,-0x39(%rcx) 5706320SbhollerL(P1Q6): 5716320Sbholler mov -0x31(%rdx),%r9 5726320Sbholler mov %r9,-0x31(%rcx) 5736320SbhollerL(P1Q5): 5746320Sbholler mov -0x29(%rdx),%r11 5756320Sbholler mov %r11,-0x29(%rcx) 5766320SbhollerL(P1Q4): 5776320Sbholler mov -0x21(%rdx),%r10 5786320Sbholler mov %r10,-0x21(%rcx) 5796320SbhollerL(P1Q3): 5806320Sbholler mov -0x19(%rdx),%r9 5816320Sbholler mov %r9,-0x19(%rcx) 5826320SbhollerL(P1Q2): 5836320Sbholler mov -0x11(%rdx),%r11 5846320Sbholler mov %r11,-0x11(%rcx) 5856320SbhollerL(P1Q1): 5866320Sbholler mov -0x9(%rdx),%r10 5876320Sbholler mov %r10,-0x9(%rcx) 5886320SbhollerL(P1Q0): 5896320Sbholler movzbq -0x1(%rdx),%r9 5906320Sbholler mov %r9b,-0x1(%rcx) 5916320Sbholler ret 5926320Sbholler 5936320Sbholler .balign 16 5946320SbhollerL(P2QF): 5956320Sbholler mov -0x7a(%rdx),%r9 5966320Sbholler mov %r9,-0x7a(%rcx) 5976320SbhollerL(P2QE): 5986320Sbholler mov -0x72(%rdx),%r11 5996320Sbholler mov %r11,-0x72(%rcx) 6006320SbhollerL(P2QD): 6016320Sbholler mov -0x6a(%rdx),%r10 6026320Sbholler mov %r10,-0x6a(%rcx) 6036320SbhollerL(P2QC): 6046320Sbholler mov -0x62(%rdx),%r9 6056320Sbholler mov %r9,-0x62(%rcx) 6066320SbhollerL(P2QB): 6076320Sbholler mov -0x5a(%rdx),%r11 6086320Sbholler mov %r11,-0x5a(%rcx) 6096320SbhollerL(P2QA): 6106320Sbholler mov -0x52(%rdx),%r10 6116320Sbholler mov %r10,-0x52(%rcx) 6126320SbhollerL(P2Q9): 6136320Sbholler mov -0x4a(%rdx),%r9 6146320Sbholler mov %r9,-0x4a(%rcx) 6156320SbhollerL(P2Q8): 6166320Sbholler mov -0x42(%rdx),%r11 6176320Sbholler mov %r11,-0x42(%rcx) 6186320SbhollerL(P2Q7): 6196320Sbholler mov -0x3a(%rdx),%r10 6206320Sbholler mov %r10,-0x3a(%rcx) 6216320SbhollerL(P2Q6): 6226320Sbholler mov -0x32(%rdx),%r9 
6236320Sbholler mov %r9,-0x32(%rcx) 6246320SbhollerL(P2Q5): 6256320Sbholler mov -0x2a(%rdx),%r11 6266320Sbholler mov %r11,-0x2a(%rcx) 6276320SbhollerL(P2Q4): 6286320Sbholler mov -0x22(%rdx),%r10 6296320Sbholler mov %r10,-0x22(%rcx) 6306320SbhollerL(P2Q3): 6316320Sbholler mov -0x1a(%rdx),%r9 6326320Sbholler mov %r9,-0x1a(%rcx) 6336320SbhollerL(P2Q2): 6346320Sbholler mov -0x12(%rdx),%r11 6356320Sbholler mov %r11,-0x12(%rcx) 6366320SbhollerL(P2Q1): 6376320Sbholler mov -0xa(%rdx),%r10 6386320Sbholler mov %r10,-0xa(%rcx) 6396320SbhollerL(P2Q0): 6406320Sbholler movzwq -0x2(%rdx),%r9 6416320Sbholler mov %r9w,-0x2(%rcx) 6426320Sbholler ret 6436320Sbholler 6446320Sbholler .balign 16 6456320SbhollerL(P3QF): 6466320Sbholler mov -0x7b(%rdx),%r9 6476320Sbholler mov %r9,-0x7b(%rcx) 6486320SbhollerL(P3QE): 6496320Sbholler mov -0x73(%rdx),%r11 6506320Sbholler mov %r11,-0x73(%rcx) 6516320SbhollerL(P3QD): 6526320Sbholler mov -0x6b(%rdx),%r10 6536320Sbholler mov %r10,-0x6b(%rcx) 6546320SbhollerL(P3QC): 6556320Sbholler mov -0x63(%rdx),%r9 6566320Sbholler mov %r9,-0x63(%rcx) 6576320SbhollerL(P3QB): 6586320Sbholler mov -0x5b(%rdx),%r11 6596320Sbholler mov %r11,-0x5b(%rcx) 6606320SbhollerL(P3QA): 6616320Sbholler mov -0x53(%rdx),%r10 6626320Sbholler mov %r10,-0x53(%rcx) 6636320SbhollerL(P3Q9): 6646320Sbholler mov -0x4b(%rdx),%r9 6656320Sbholler mov %r9,-0x4b(%rcx) 6666320SbhollerL(P3Q8): 6676320Sbholler mov -0x43(%rdx),%r11 6686320Sbholler mov %r11,-0x43(%rcx) 6696320SbhollerL(P3Q7): 6706320Sbholler mov -0x3b(%rdx),%r10 6716320Sbholler mov %r10,-0x3b(%rcx) 6726320SbhollerL(P3Q6): 6736320Sbholler mov -0x33(%rdx),%r9 6746320Sbholler mov %r9,-0x33(%rcx) 6756320SbhollerL(P3Q5): 6766320Sbholler mov -0x2b(%rdx),%r11 6776320Sbholler mov %r11,-0x2b(%rcx) 6786320SbhollerL(P3Q4): 6796320Sbholler mov -0x23(%rdx),%r10 6806320Sbholler mov %r10,-0x23(%rcx) 6816320SbhollerL(P3Q3): 6826320Sbholler mov -0x1b(%rdx),%r9 6836320Sbholler mov %r9,-0x1b(%rcx) 6846320SbhollerL(P3Q2): 6856320Sbholler mov 
-0x13(%rdx),%r11 6866320Sbholler mov %r11,-0x13(%rcx) 6876320SbhollerL(P3Q1): 6886320Sbholler mov -0xb(%rdx),%r10 6896320Sbholler mov %r10,-0xb(%rcx) 6906320Sbholler /* 6916320Sbholler * These trailing loads/stores have to do all their loads 1st, 6926320Sbholler * then do the stores. 6936320Sbholler */ 6946320SbhollerL(P3Q0): 6956320Sbholler movzwq -0x3(%rdx),%r9 6966320Sbholler movzbq -0x1(%rdx),%r10 6976320Sbholler mov %r9w,-0x3(%rcx) 6986320Sbholler mov %r10b,-0x1(%rcx) 6996320Sbholler ret 7006320Sbholler 7016320Sbholler .balign 16 7026320SbhollerL(P4QF): 7036320Sbholler mov -0x7c(%rdx),%r9 7046320Sbholler mov %r9,-0x7c(%rcx) 7056320SbhollerL(P4QE): 7066320Sbholler mov -0x74(%rdx),%r11 7076320Sbholler mov %r11,-0x74(%rcx) 7086320SbhollerL(P4QD): 7096320Sbholler mov -0x6c(%rdx),%r10 7106320Sbholler mov %r10,-0x6c(%rcx) 7116320SbhollerL(P4QC): 7126320Sbholler mov -0x64(%rdx),%r9 7136320Sbholler mov %r9,-0x64(%rcx) 7146320SbhollerL(P4QB): 7156320Sbholler mov -0x5c(%rdx),%r11 7166320Sbholler mov %r11,-0x5c(%rcx) 7176320SbhollerL(P4QA): 7186320Sbholler mov -0x54(%rdx),%r10 7196320Sbholler mov %r10,-0x54(%rcx) 7206320SbhollerL(P4Q9): 7216320Sbholler mov -0x4c(%rdx),%r9 7226320Sbholler mov %r9,-0x4c(%rcx) 7236320SbhollerL(P4Q8): 7246320Sbholler mov -0x44(%rdx),%r11 7256320Sbholler mov %r11,-0x44(%rcx) 7266320SbhollerL(P4Q7): 7276320Sbholler mov -0x3c(%rdx),%r10 7286320Sbholler mov %r10,-0x3c(%rcx) 7296320SbhollerL(P4Q6): 7306320Sbholler mov -0x34(%rdx),%r9 7316320Sbholler mov %r9,-0x34(%rcx) 7326320SbhollerL(P4Q5): 7336320Sbholler mov -0x2c(%rdx),%r11 7346320Sbholler mov %r11,-0x2c(%rcx) 7356320SbhollerL(P4Q4): 7366320Sbholler mov -0x24(%rdx),%r10 7376320Sbholler mov %r10,-0x24(%rcx) 7386320SbhollerL(P4Q3): 7396320Sbholler mov -0x1c(%rdx),%r9 7406320Sbholler mov %r9,-0x1c(%rcx) 7416320SbhollerL(P4Q2): 7426320Sbholler mov -0x14(%rdx),%r11 7436320Sbholler mov %r11,-0x14(%rcx) 7446320SbhollerL(P4Q1): 7456320Sbholler mov -0xc(%rdx),%r10 7466320Sbholler mov %r10,-0xc(%rcx) 
7476320SbhollerL(P4Q0): 7486320Sbholler mov -0x4(%rdx),%r9d 7496320Sbholler mov %r9d,-0x4(%rcx) 7506320Sbholler ret 7516320Sbholler 7526320Sbholler .balign 16 7536320SbhollerL(P5QF): 7546320Sbholler mov -0x7d(%rdx),%r9 7556320Sbholler mov %r9,-0x7d(%rcx) 7566320SbhollerL(P5QE): 7576320Sbholler mov -0x75(%rdx),%r11 7586320Sbholler mov %r11,-0x75(%rcx) 7596320SbhollerL(P5QD): 7606320Sbholler mov -0x6d(%rdx),%r10 7616320Sbholler mov %r10,-0x6d(%rcx) 7626320SbhollerL(P5QC): 7636320Sbholler mov -0x65(%rdx),%r9 7646320Sbholler mov %r9,-0x65(%rcx) 7656320SbhollerL(P5QB): 7666320Sbholler mov -0x5d(%rdx),%r11 7676320Sbholler mov %r11,-0x5d(%rcx) 7686320SbhollerL(P5QA): 7696320Sbholler mov -0x55(%rdx),%r10 7706320Sbholler mov %r10,-0x55(%rcx) 7716320SbhollerL(P5Q9): 7726320Sbholler mov -0x4d(%rdx),%r9 7736320Sbholler mov %r9,-0x4d(%rcx) 7746320SbhollerL(P5Q8): 7756320Sbholler mov -0x45(%rdx),%r11 7766320Sbholler mov %r11,-0x45(%rcx) 7776320SbhollerL(P5Q7): 7786320Sbholler mov -0x3d(%rdx),%r10 7796320Sbholler mov %r10,-0x3d(%rcx) 7806320SbhollerL(P5Q6): 7816320Sbholler mov -0x35(%rdx),%r9 7826320Sbholler mov %r9,-0x35(%rcx) 7836320SbhollerL(P5Q5): 7846320Sbholler mov -0x2d(%rdx),%r11 7856320Sbholler mov %r11,-0x2d(%rcx) 7866320SbhollerL(P5Q4): 7876320Sbholler mov -0x25(%rdx),%r10 7886320Sbholler mov %r10,-0x25(%rcx) 7896320SbhollerL(P5Q3): 7906320Sbholler mov -0x1d(%rdx),%r9 7916320Sbholler mov %r9,-0x1d(%rcx) 7926320SbhollerL(P5Q2): 7936320Sbholler mov -0x15(%rdx),%r11 7946320Sbholler mov %r11,-0x15(%rcx) 7956320SbhollerL(P5Q1): 7966320Sbholler mov -0xd(%rdx),%r10 7976320Sbholler mov %r10,-0xd(%rcx) 7986320Sbholler /* 7996320Sbholler * These trailing loads/stores have to do all their loads 1st, 8006320Sbholler * then do the stores. 
8016320Sbholler */ 8026320SbhollerL(P5Q0): 8036320Sbholler mov -0x5(%rdx),%r9d 8046320Sbholler movzbq -0x1(%rdx),%r10 8056320Sbholler mov %r9d,-0x5(%rcx) 8066320Sbholler mov %r10b,-0x1(%rcx) 8076320Sbholler ret 8086320Sbholler 8096320Sbholler .balign 16 8106320SbhollerL(P6QF): 8116320Sbholler mov -0x7e(%rdx),%r9 8126320Sbholler mov %r9,-0x7e(%rcx) 8136320SbhollerL(P6QE): 8146320Sbholler mov -0x76(%rdx),%r11 8156320Sbholler mov %r11,-0x76(%rcx) 8166320SbhollerL(P6QD): 8176320Sbholler mov -0x6e(%rdx),%r10 8186320Sbholler mov %r10,-0x6e(%rcx) 8196320SbhollerL(P6QC): 8206320Sbholler mov -0x66(%rdx),%r9 8216320Sbholler mov %r9,-0x66(%rcx) 8226320SbhollerL(P6QB): 8236320Sbholler mov -0x5e(%rdx),%r11 8246320Sbholler mov %r11,-0x5e(%rcx) 8256320SbhollerL(P6QA): 8266320Sbholler mov -0x56(%rdx),%r10 8276320Sbholler mov %r10,-0x56(%rcx) 8286320SbhollerL(P6Q9): 8296320Sbholler mov -0x4e(%rdx),%r9 8306320Sbholler mov %r9,-0x4e(%rcx) 8316320SbhollerL(P6Q8): 8326320Sbholler mov -0x46(%rdx),%r11 8336320Sbholler mov %r11,-0x46(%rcx) 8346320SbhollerL(P6Q7): 8356320Sbholler mov -0x3e(%rdx),%r10 8366320Sbholler mov %r10,-0x3e(%rcx) 8376320SbhollerL(P6Q6): 8386320Sbholler mov -0x36(%rdx),%r9 8396320Sbholler mov %r9,-0x36(%rcx) 8406320SbhollerL(P6Q5): 8416320Sbholler mov -0x2e(%rdx),%r11 8426320Sbholler mov %r11,-0x2e(%rcx) 8436320SbhollerL(P6Q4): 8446320Sbholler mov -0x26(%rdx),%r10 8456320Sbholler mov %r10,-0x26(%rcx) 8466320SbhollerL(P6Q3): 8476320Sbholler mov -0x1e(%rdx),%r9 8486320Sbholler mov %r9,-0x1e(%rcx) 8496320SbhollerL(P6Q2): 8506320Sbholler mov -0x16(%rdx),%r11 8516320Sbholler mov %r11,-0x16(%rcx) 8526320SbhollerL(P6Q1): 8536320Sbholler mov -0xe(%rdx),%r10 8546320Sbholler mov %r10,-0xe(%rcx) 8556320Sbholler /* 8566320Sbholler * These trailing loads/stores have to do all their loads 1st, 8576320Sbholler * then do the stores. 
	 */
/*
 * Register roles throughout this region (established by the code below):
 *   %rdx = source pointer, %rcx = destination pointer, %r8 = byte count.
 * The L(PxQy) labels are fixed-size copy terminators; NOTE(review): they
 * appear to be reached through the size-indexed L(fwdPxQx) jump table
 * (see its use in L(movdqa_epi) below) -- the table itself is outside
 * this chunk.
 */

/*
 * Copy the final 6 bytes as one 4-byte and one 2-byte move.
 * The loads are issued before the stores.
 */
L(P6Q0):
	mov    -0x6(%rdx),%r9d
	movzwq -0x2(%rdx),%r10
	mov    %r9d,-0x6(%rcx)
	mov    %r10w,-0x2(%rcx)
	ret

	/*
	 * Fall-through chain for tails of size 8n+7: each label copies one
	 * 8-byte quadword at a descending negative offset from the already
	 * advanced src/dest pointers, then falls through to the next label;
	 * L(P7Q0) finishes the last 7 bytes.
	 */
	.balign 16
L(P7QF):
	mov    -0x7f(%rdx),%r9
	mov    %r9,-0x7f(%rcx)
L(P7QE):
	mov    -0x77(%rdx),%r11
	mov    %r11,-0x77(%rcx)
L(P7QD):
	mov    -0x6f(%rdx),%r10
	mov    %r10,-0x6f(%rcx)
L(P7QC):
	mov    -0x67(%rdx),%r9
	mov    %r9,-0x67(%rcx)
L(P7QB):
	mov    -0x5f(%rdx),%r11
	mov    %r11,-0x5f(%rcx)
L(P7QA):
	mov    -0x57(%rdx),%r10
	mov    %r10,-0x57(%rcx)
L(P7Q9):
	mov    -0x4f(%rdx),%r9
	mov    %r9,-0x4f(%rcx)
L(P7Q8):
	mov    -0x47(%rdx),%r11
	mov    %r11,-0x47(%rcx)
L(P7Q7):
	mov    -0x3f(%rdx),%r10
	mov    %r10,-0x3f(%rcx)
L(P7Q6):
	mov    -0x37(%rdx),%r9
	mov    %r9,-0x37(%rcx)
L(P7Q5):
	mov    -0x2f(%rdx),%r11
	mov    %r11,-0x2f(%rcx)
L(P7Q4):
	mov    -0x27(%rdx),%r10
	mov    %r10,-0x27(%rcx)
L(P7Q3):
	mov    -0x1f(%rdx),%r9
	mov    %r9,-0x1f(%rcx)
L(P7Q2):
	mov    -0x17(%rdx),%r11
	mov    %r11,-0x17(%rcx)
L(P7Q1):
	mov    -0xf(%rdx),%r10
	mov    %r10,-0xf(%rcx)
	/*
	 * These trailing loads/stores have to do all their loads 1st,
	 * then do the stores.
	 */
L(P7Q0):
	/* final 7 bytes = 4-byte + 2-byte + 1-byte move */
	mov    -0x7(%rdx),%r9d
	movzwq -0x3(%rdx),%r10
	movzbq -0x1(%rdx),%r11
	mov    %r9d,-0x7(%rcx)
	mov    %r10w,-0x3(%rcx)
	mov    %r11b,-0x1(%rcx)
	ret

	.balign 16
L(ck_use_sse2):
	/*
	 * Align dest to 16 byte boundary.
	 */
	test   $0xf,%rcx
	jnz    L(ShrtAlignNew)

L(now_qw_aligned):
	/*
	 * Dest is now 16-byte aligned.  .memops_method is a per-CPU
	 * capability word (presumably initialized via proc64_id.h --
	 * its setup is outside this chunk).
	 */
	cmpl   $NO_SSE,.memops_method(%rip)
	je     L(Loop8byte_pre)

	/*
	 * The fall-through path is to do SSE2 16-byte load/stores
	 */

	/*
	 * If current move size is larger than half of the highest level cache
	 * size, then do non-temporal moves.
	 */
	mov    .largest_level_cache_size(%rip),%r9d
	shr    %r9			# take half of it
	cmp    %r9,%r8
	jg     L(sse2_nt_move)

	/*
	 * If both the source and dest are aligned, then use the both aligned
	 * logic. Well aligned data should reap the rewards.
	 */
	test   $0xf,%rdx
	jz     L(pre_both_aligned)

	/*
	 * Source is misaligned: pick the dispatch table for the best
	 * available shift/merge instruction set.
	 */
	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
	testl  $USE_SSSE3,.memops_method(%rip)
	jz     1f
	lea    L(SSSE3_src)(%rip),%r10		# SSSE3

1:
	/*
	 * if the src is not 16 byte aligned...
	 */
	/*
	 * Copy the first 16 bytes unaligned, then round src DOWN to its
	 * 16-byte boundary (%r11 = src & 0xf; rdx += 0x10 - r11).  After
	 * this, src lags the true position by %r11 bytes -- that lag is
	 * repaid by "add %r11,%rdx" in L(movdqa_epi).  %xmm1 carries the
	 * first aligned 16-byte block into the shift/merge loops.
	 */
	mov    %rdx,%r11
	and    $0xf,%r11
	movdqu (%rdx),%xmm0
	movdqa %xmm0,(%rcx)
	add    $0x10,%rdx
	sub    %r11,%rdx
	add    $0x10,%rcx
	sub    $0x10,%r8
	movdqa (%rdx),%xmm1

	/*
	 * Table-dispatch on the source misalignment (%r11 in 0..15):
	 * each entry is a signed 32-bit handler offset relative to the
	 * table base.
	 */
	movslq (%r10,%r11,4),%r9
	lea    (%r9,%r10,1),%r10
	jmpq   *%r10

	/*
	 * Note: slot 8 of the SSSE3 table reuses L(movdqa8) (the
	 * shufpd-based SSE2 handler) -- there is no mov3dqa8.
	 */
	.balign 16
L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
	.int        L(mov3dqa1) -L(SSSE3_src)
	.int        L(mov3dqa2) -L(SSSE3_src)
	.int        L(mov3dqa3) -L(SSSE3_src)
	.int        L(mov3dqa4) -L(SSSE3_src)
	.int        L(mov3dqa5) -L(SSSE3_src)
	.int        L(mov3dqa6) -L(SSSE3_src)
	.int        L(mov3dqa7) -L(SSSE3_src)
	.int        L(movdqa8)  -L(SSSE3_src)
	.int        L(mov3dqa9) -L(SSSE3_src)
	.int        L(mov3dqa10)-L(SSSE3_src)
	.int        L(mov3dqa11)-L(SSSE3_src)
	.int        L(mov3dqa12)-L(SSSE3_src)
	.int        L(mov3dqa13)-L(SSSE3_src)
	.int        L(mov3dqa14)-L(SSSE3_src)
	.int        L(mov3dqa15)-L(SSSE3_src)
L(SSE_src): .int	L(pre_both_aligned)-L(SSE_src)
	.int        L(movdqa1) -L(SSE_src)
	.int        L(movdqa2) -L(SSE_src)
	.int        L(movdqa3) -L(SSE_src)
	.int        L(movdqa4) -L(SSE_src)
	.int        L(movdqa5) -L(SSE_src)
	.int        L(movdqa6) -L(SSE_src)
	.int        L(movdqa7) -L(SSE_src)
	.int        L(movdqa8) -L(SSE_src)
	.int        L(movdqa9) -L(SSE_src)
	.int        L(movdqa10)-L(SSE_src)
	.int        L(movdqa11)-L(SSE_src)
	.int        L(movdqa12)-L(SSE_src)
	.int        L(movdqa13)-L(SSE_src)
	.int        L(movdqa14)-L(SSE_src)
	.int        L(movdqa15)-L(SSE_src)

	/*
	 * SSE2 merge loop for src 1 byte past a 16-byte boundary
	 * (src & 0xf == 1).  Each iteration loads two aligned 16-byte
	 * blocks and merges each with the block carried in %xmm1 from
	 * the previous iteration: previous block shifted right 1 byte
	 * (psrldq), next block shifted left 15 (pslldq), OR'd together.
	 * Copies 0x20 bytes per iteration while %r8 >= 0x20; residue is
	 * finished in L(movdqa_epi).
	 */
	.balign 16
L(movdqa1):
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	movdqa 0x20(%rdx),%xmm0	# load the upper source buffer
	lea    0x20(%rdx),%rdx
	lea    -0x20(%r8),%r8

	psrldq $0x1,%xmm1	# shift right prev buffer (saved from last iteration)
	movdqa %xmm3,%xmm2	# store off xmm reg for use next iteration
	pslldq $0xf,%xmm3	# shift the current buffer left (shift in zeros)
	por    %xmm1,%xmm3	# OR them together
	cmp    $0x20,%r8

	psrldq $0x1,%xmm2	# shift right prev buffer (saved from last iteration)
	movdqa %xmm0,%xmm1	# store off xmm reg for use next iteration
	pslldq $0xf,%xmm0	# shift the current buffer left (shift in zeros)
	por    %xmm2,%xmm0	# OR them together
	movdqa %xmm3,(%rcx)	# store it
	movdqa %xmm0,0x10(%rcx)	# store it
	lea    0x20(%rcx),%rcx

	jge    L(movdqa1)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme as L(movdqa1) for src alignment 2: shift counts
	 * are 2 and 14 (0xe).
	 */
	.balign 16
L(movdqa2):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x2,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xe,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x2,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xe,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa2)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 3: shift counts 3 and 13 (0xd).
	 */
	.balign 16
L(movdqa3):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x3,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xd,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x3,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xd,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa3)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 4: shift counts 4 and 12 (0xc).
	 */
	.balign 16
L(movdqa4):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x4,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xc,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x4,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xc,%xmm0
	por    %xmm2,%xmm0

	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa4)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 5: shift counts 5 and 11 (0xb).
	 * (Stores and loop test continue past this chunk boundary.)
	 */
	.balign 16
L(movdqa5):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x5,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xb,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x5,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xb,%xmm0
	por    %xmm2,%xmm0

	/* tail of L(movdqa5): store the two merged blocks and loop */
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa5)
	jmp    L(movdqa_epi)

	/*
	 * Same merge scheme as L(movdqa1) for src alignment 6: shift
	 * counts 6 and 10 (0xa).
	 */
	.balign 16
L(movdqa6):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x6,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xa,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x6,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xa,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa6)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 7: shift counts 7 and 9.
	 */
	.balign 16
L(movdqa7):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x7,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x9,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x7,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x9,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa7)
	jmp    L(movdqa_epi)

	/*
	 * Src alignment 8 is special: a single shufpd $1 concatenates the
	 * high 8-byte half of the previous block with the low half of the
	 * next, so no shift/OR pair is needed and three 16-byte stores
	 * (0x30 bytes) are done per iteration.  This handler is shared by
	 * both the SSE2 and SSSE3 dispatch tables.
	 */
	.balign 16
L(movdqa8):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx

	shufpd $0x1,%xmm3,%xmm1
	movdqa %xmm1,(%rcx)

	cmp    $0x30,%r8

	shufpd $0x1,%xmm0,%xmm3
	movdqa %xmm3,0x10(%rcx)

	movdqa %xmm5,%xmm1
	shufpd $0x1,%xmm5,%xmm0
	movdqa %xmm0,0x20(%rcx)

	lea    0x30(%rcx),%rcx

	jge    L(movdqa8)
	jmp    L(movdqa_epi)

	/*
	 * Same merge scheme as L(movdqa1) for src alignment 9: shift
	 * counts 9 and 7.
	 */
	.balign 16
L(movdqa9):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x9,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x7,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x9,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x7,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa9)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 10 (0xa): shift counts 10 and 6.
	 */
	.balign 16
L(movdqa10):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xa,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x6,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xa,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x6,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa10)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 11 (0xb): shift counts 11 and 5.
	 */
	.balign 16
L(movdqa11):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xb,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x5,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xb,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x5,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa11)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 12 (0xc): shift counts 12 and 4.
	 */
	.balign 16
L(movdqa12):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xc,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x4,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xc,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x4,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa12)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 13 (0xd): shift counts 13 and 3.
	 * (Continues past this chunk boundary.)
	 */
	.balign 16
L(movdqa13):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xd,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x3,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xd,%xmm2
	/* tail of L(movdqa13): merge, store, and loop */
	movdqa %xmm0,%xmm1
	pslldq $0x3,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa13)
	jmp    L(movdqa_epi)

	/*
	 * Same merge scheme as L(movdqa1) for src alignment 14 (0xe):
	 * shift counts 14 and 2.
	 */
	.balign 16
L(movdqa14):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xe,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x2,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xe,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x2,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa14)
	jmp    L(movdqa_epi)

	/*
	 * Same scheme for src alignment 15 (0xf): shift counts 15 and 1.
	 * Falls straight through into L(movdqa_epi).
	 */
	.balign 16
L(movdqa15):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xf,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x1,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xf,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x1,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa15)
	#jmp   L(movdqa_epi)

	/*
	 * Common epilogue for all the misaligned-source loops above:
	 * repay the %r11-byte source lag introduced before dispatch,
	 * advance both pointers past the remaining %r8 bytes, then jump
	 * through the L(fwdPxQx) table (indexed by %r8) to the matching
	 * L(PxQy) terminator, which copies backwards from the adjusted
	 * pointers.
	 */
	.balign 16
L(movdqa_epi):
	lea    L(fwdPxQx)(%rip),%r10
	add    %r11,%rdx	# bump rdx to the right addr (it lagged behind in the above loop)
	add    %r8,%rcx
	add    %r8,%rdx

	movslq (%r10,%r8,4),%r9
	lea    (%r9,%r10,1),%r10
	jmpq   *%r10

	/*
	 * SSSE3 variant for src alignment 1: palignr concatenates
	 * prev:current and extracts 16 bytes at offset 1 in a single
	 * instruction.  Copies 0x30 bytes per iteration, then up to two
	 * 16-byte trailer moves before finishing in L(movdqa_epi).
	 * The palignr instructions are hand-encoded with .byte (the
	 * intended mnemonic is shown in the # comment on each).
	 */
	.balign 16
L(mov3dqa1):
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0	# load the upper source buffer
	movdqa 0x30(%rdx),%xmm5	# load the upper source buffer
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2	# store off xmm reg for use next iteration
	#palignr $0x1,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x01
	movdqa %xmm3,(%rcx)	# store it

	movdqa %xmm0,%xmm4	# store off xmm reg for use next iteration
	#palignr $0x1,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x01
	movdqa %xmm0,0x10(%rcx)	# store it

	movdqa %xmm5,%xmm1	# store off xmm reg for use next iteration
	#palignr $0x1,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x01
	movdqa %xmm5,0x20(%rcx)	# store it

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa1)

	/* first 16-byte trailer, if at least 0x10 bytes remain */
	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x1,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x01

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	/* second 16-byte trailer */
	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x1,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x01
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 2 (palignr immediate 0x2);
	 * same structure as L(mov3dqa1).
	 */
	.balign 16
L(mov3dqa2):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x2,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x02
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x2,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x02
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x2,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x02
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa2)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x2,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x02

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x2,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x02
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 3 (palignr immediate 0x3);
	 * same structure as L(mov3dqa1).  (Continues past this chunk
	 * boundary.)
	 */
	.balign 16
L(mov3dqa3):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x3,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x03
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x3,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x03
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x3,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x03
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa3)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x3,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x03

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	/* tail of L(mov3dqa3): finish trailers, then L(movdqa_epi) */
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x3,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x03
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 4 (palignr immediate 0x4);
	 * same structure as L(mov3dqa1): 0x30 bytes per iteration plus
	 * up to two 16-byte trailers.  palignr is hand-encoded with
	 * .byte (mnemonic shown in each # comment).
	 */
	.balign 16
L(mov3dqa4):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x4,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x04
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x4,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x04
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x4,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x04
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa4)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x4,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x04

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x4,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x04
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 5 (palignr immediate 0x5).
	 */
	.balign 16
L(mov3dqa5):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x5,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x05
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x5,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x05
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x5,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x05
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa5)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x5,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x05

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x5,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x05
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 6 (palignr immediate 0x6).
	 */
	.balign 16
L(mov3dqa6):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x6,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x06
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x6,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x06
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x6,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x06
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa6)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x6,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x06

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x6,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x06
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 7 (palignr immediate 0x7).
	 * Alignment 8 is handled by L(movdqa8) above, so the sequence
	 * skips from mov3dqa7 to mov3dqa9.
	 */
	.balign 16
L(mov3dqa7):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x7,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x07
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x7,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x07
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x7,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x07
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa7)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x7,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x07

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x7,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x07
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 9 (palignr immediate 0x9).
	 */
	.balign 16
L(mov3dqa9):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x9,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x09
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x9,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x09
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x9,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x09
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa9)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x9,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x09

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x9,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x09
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 10 (palignr immediate 0xa).
	 */
	.balign 16
L(mov3dqa10):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0xa,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0a
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0xa,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0a
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0xa,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x0a
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa10)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0xa,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0a

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0xa,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0a
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 11 (palignr immediate 0xb).
	 */
	.balign 16
L(mov3dqa11):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0xb,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0b
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0xb,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0b
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0xb,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x0b
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa11)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0xb,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0b

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0xb,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0b
	movdqa %xmm0,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * SSSE3 variant for src alignment 12 (palignr immediate 0xc).
	 * (Continues past this chunk boundary.)
	 */
	.balign 16
L(mov3dqa12):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0xc,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0c
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0xc,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0c
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0xc,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x0c
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa12)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3	# load the upper source buffer
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0xc,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0c

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
19806320Sbholler lea 0x10(%rcx),%rcx 19816320Sbholler jl L(movdqa_epi) 19826320Sbholler 19836320Sbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 19846320Sbholler sub $0x10,%r8 19856320Sbholler lea 0x10(%rdx),%rdx 19866320Sbholler #palignr $0xc,%xmm2,%xmm0 19876320Sbholler .byte 0x66,0x0f,0x3a,0x0f 19886320Sbholler .byte 0xc2,0x0c 19896320Sbholler movdqa %xmm0,(%rcx) # store it 19906320Sbholler lea 0x10(%rcx),%rcx 19916320Sbholler jmp L(movdqa_epi) 19926320Sbholler 19936320Sbholler .balign 16 19946320SbhollerL(mov3dqa13): 19956320Sbholler movdqa 0x10(%rdx),%xmm3 19966320Sbholler sub $0x30,%r8 19976320Sbholler movdqa 0x20(%rdx),%xmm0 19986320Sbholler movdqa 0x30(%rdx),%xmm5 19996320Sbholler lea 0x30(%rdx),%rdx 20006320Sbholler cmp $0x30,%r8 20016320Sbholler 20026320Sbholler movdqa %xmm3,%xmm2 20036320Sbholler #palignr $0xd,%xmm1,%xmm3 20046320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20056320Sbholler .byte 0xd9,0x0d 20066320Sbholler movdqa %xmm3,(%rcx) 20076320Sbholler 20086320Sbholler movdqa %xmm0,%xmm4 20096320Sbholler #palignr $0xd,%xmm2,%xmm0 20106320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20116320Sbholler .byte 0xc2,0x0d 20126320Sbholler movdqa %xmm0,0x10(%rcx) 20136320Sbholler 20146320Sbholler movdqa %xmm5,%xmm1 20156320Sbholler #palignr $0xd,%xmm4,%xmm5 20166320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20176320Sbholler .byte 0xec,0x0d 20186320Sbholler movdqa %xmm5,0x20(%rcx) 20196320Sbholler 20206320Sbholler lea 0x30(%rcx),%rcx 20216320Sbholler jge L(mov3dqa13) 20226320Sbholler 20236320Sbholler cmp $0x10,%r8 20246320Sbholler jl L(movdqa_epi) 20256320Sbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 20266320Sbholler sub $0x10,%r8 20276320Sbholler lea 0x10(%rdx),%rdx 20286320Sbholler movdqa %xmm3,%xmm2 # save for use next concat 20296320Sbholler #palignr $0xd,%xmm1,%xmm3 20306320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20316320Sbholler .byte 0xd9,0x0d 20326320Sbholler 20336320Sbholler cmp $0x10,%r8 20346320Sbholler movdqa %xmm3,(%rcx) # store it 
20356320Sbholler lea 0x10(%rcx),%rcx 20366320Sbholler jl L(movdqa_epi) 20376320Sbholler 20386320Sbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 20396320Sbholler sub $0x10,%r8 20406320Sbholler lea 0x10(%rdx),%rdx 20416320Sbholler #palignr $0xd,%xmm2,%xmm0 20426320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20436320Sbholler .byte 0xc2,0x0d 20446320Sbholler movdqa %xmm0,(%rcx) # store it 20456320Sbholler lea 0x10(%rcx),%rcx 20466320Sbholler jmp L(movdqa_epi) 20476320Sbholler 20486320Sbholler .balign 16 20496320SbhollerL(mov3dqa14): 20506320Sbholler movdqa 0x10(%rdx),%xmm3 20516320Sbholler sub $0x30,%r8 20526320Sbholler movdqa 0x20(%rdx),%xmm0 20536320Sbholler movdqa 0x30(%rdx),%xmm5 20546320Sbholler lea 0x30(%rdx),%rdx 20556320Sbholler cmp $0x30,%r8 20566320Sbholler 20576320Sbholler movdqa %xmm3,%xmm2 20586320Sbholler #palignr $0xe,%xmm1,%xmm3 20596320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20606320Sbholler .byte 0xd9,0x0e 20616320Sbholler movdqa %xmm3,(%rcx) 20626320Sbholler 20636320Sbholler movdqa %xmm0,%xmm4 20646320Sbholler #palignr $0xe,%xmm2,%xmm0 20656320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20666320Sbholler .byte 0xc2,0x0e 20676320Sbholler movdqa %xmm0,0x10(%rcx) 20686320Sbholler 20696320Sbholler movdqa %xmm5,%xmm1 20706320Sbholler #palignr $0xe,%xmm4,%xmm5 20716320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20726320Sbholler .byte 0xec,0x0e 20736320Sbholler movdqa %xmm5,0x20(%rcx) 20746320Sbholler 20756320Sbholler lea 0x30(%rcx),%rcx 20766320Sbholler jge L(mov3dqa14) 20776320Sbholler 20786320Sbholler cmp $0x10,%r8 20796320Sbholler jl L(movdqa_epi) 20806320Sbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 20816320Sbholler sub $0x10,%r8 20826320Sbholler lea 0x10(%rdx),%rdx 20836320Sbholler movdqa %xmm3,%xmm2 # save for use next concat 20846320Sbholler #palignr $0xe,%xmm1,%xmm3 20856320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20866320Sbholler .byte 0xd9,0x0e 20876320Sbholler 20886320Sbholler cmp $0x10,%r8 20896320Sbholler movdqa %xmm3,(%rcx) # store it 
20906320Sbholler lea 0x10(%rcx),%rcx 20916320Sbholler jl L(movdqa_epi) 20926320Sbholler 20936320Sbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 20946320Sbholler sub $0x10,%r8 20956320Sbholler lea 0x10(%rdx),%rdx 20966320Sbholler #palignr $0xe,%xmm2,%xmm0 20976320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20986320Sbholler .byte 0xc2,0x0e 20996320Sbholler movdqa %xmm0,(%rcx) # store it 21006320Sbholler lea 0x10(%rcx),%rcx 21016320Sbholler jmp L(movdqa_epi) 21026320Sbholler 21036320Sbholler .balign 16 21046320SbhollerL(mov3dqa15): 21056320Sbholler movdqa 0x10(%rdx),%xmm3 21066320Sbholler sub $0x30,%r8 21076320Sbholler movdqa 0x20(%rdx),%xmm0 21086320Sbholler movdqa 0x30(%rdx),%xmm5 21096320Sbholler lea 0x30(%rdx),%rdx 21106320Sbholler cmp $0x30,%r8 21116320Sbholler 21126320Sbholler movdqa %xmm3,%xmm2 21136320Sbholler #palignr $0xf,%xmm1,%xmm3 21146320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21156320Sbholler .byte 0xd9,0x0f 21166320Sbholler movdqa %xmm3,(%rcx) 21176320Sbholler 21186320Sbholler movdqa %xmm0,%xmm4 21196320Sbholler #palignr $0xf,%xmm2,%xmm0 21206320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21216320Sbholler .byte 0xc2,0x0f 21226320Sbholler movdqa %xmm0,0x10(%rcx) 21236320Sbholler 21246320Sbholler movdqa %xmm5,%xmm1 21256320Sbholler #palignr $0xf,%xmm4,%xmm5 21266320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21276320Sbholler .byte 0xec,0x0f 21286320Sbholler movdqa %xmm5,0x20(%rcx) 21296320Sbholler 21306320Sbholler lea 0x30(%rcx),%rcx 21316320Sbholler jge L(mov3dqa15) 21326320Sbholler 21336320Sbholler cmp $0x10,%r8 21346320Sbholler jl L(movdqa_epi) 21356320Sbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 21366320Sbholler sub $0x10,%r8 21376320Sbholler lea 0x10(%rdx),%rdx 21386320Sbholler movdqa %xmm3,%xmm2 # save for use next concat 21396320Sbholler #palignr $0xf,%xmm1,%xmm3 21406320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21416320Sbholler .byte 0xd9,0x0f 21426320Sbholler 21436320Sbholler cmp $0x10,%r8 21446320Sbholler movdqa %xmm3,(%rcx) # store it 
21456320Sbholler lea 0x10(%rcx),%rcx 21466320Sbholler jl L(movdqa_epi) 21476320Sbholler 21486320Sbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 21496320Sbholler sub $0x10,%r8 21506320Sbholler lea 0x10(%rdx),%rdx 21516320Sbholler #palignr $0xf,%xmm2,%xmm0 21526320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21536320Sbholler .byte 0xc2,0x0f 21546320Sbholler movdqa %xmm0,(%rcx) # store it 21556320Sbholler lea 0x10(%rcx),%rcx 21566320Sbholler jmp L(movdqa_epi) 21576320Sbholler 21586320Sbholler .balign 16 21596320SbhollerL(sse2_nt_move): 21606320Sbholler lea 0x40(%rcx),%rcx 21616320Sbholler lea 0x40(%rdx),%rdx 21626320Sbholler lea -0x40(%r8),%r8 21636320Sbholler 21646320Sbholler /* 21656320Sbholler * doesn't matter if source is aligned for stuff out of cache. 21666320Sbholler * the mis-aligned penalty is masked by the slowness of main memory. 21676320Sbholler */ 21686320Sbholler prefetchnta 0x180(%rdx) 21696320Sbholler movdqu -0x40(%rdx),%xmm0 21706320Sbholler movdqu -0x30(%rdx),%xmm1 21716320Sbholler 21726320Sbholler cmp $0x40,%r8 21736320Sbholler movntdq %xmm0,-0x40(%rcx) 21746320Sbholler movntdq %xmm1,-0x30(%rcx) 21756320Sbholler 21766320Sbholler movdqu -0x20(%rdx),%xmm2 21776320Sbholler movdqu -0x10(%rdx),%xmm3 21786320Sbholler 21796320Sbholler movntdq %xmm2,-0x20(%rcx) 21806320Sbholler movntdq %xmm3,-0x10(%rcx) 21816320Sbholler 21826320Sbholler jge L(sse2_nt_move) 21836320Sbholler 21846320Sbholler lea L(Fix16EndTable)(%rip),%r10 21856320Sbholler mov %r8,%r9 21866320Sbholler and $0xFFFFFFFFFFFFFFF0,%r9 21876320Sbholler add %r9,%rcx 21886320Sbholler add %r9,%rdx 21896320Sbholler sub %r9,%r8 21906320Sbholler shr $0x4,%r9 21916320Sbholler sfence 21926320Sbholler 21936320Sbholler movslq (%r10,%r9,4),%r11 21946320Sbholler lea (%r11,%r10,1),%r10 21956320Sbholler jmpq *%r10 21966320Sbholler 21976320Sbholler .balign 16 21986320SbhollerL(Fix16EndTable): 21996320Sbholler .int L(fix16_0)-L(Fix16EndTable) 22006320Sbholler .int L(fix16_1)-L(Fix16EndTable) 22016320Sbholler 
.int L(fix16_2)-L(Fix16EndTable) 22026320Sbholler .int L(fix16_3)-L(Fix16EndTable) 22036320Sbholler 22046320Sbholler .balign 16 22056320SbhollerL(fix16_3): 22066320Sbholler movdqu -0x30(%rdx),%xmm1 22076320Sbholler movdqa %xmm1,-0x30(%rcx) 22086320SbhollerL(fix16_2): 22096320Sbholler movdqu -0x20(%rdx),%xmm2 22106320Sbholler movdqa %xmm2,-0x20(%rcx) 22116320SbhollerL(fix16_1): 22126320Sbholler movdqu -0x10(%rdx),%xmm3 22136320Sbholler movdqa %xmm3,-0x10(%rcx) 22146320SbhollerL(fix16_0): 22156320Sbholler lea L(fwdPxQx)(%rip),%r10 22166320Sbholler add %r8,%rdx 22176320Sbholler add %r8,%rcx 22186320Sbholler 22196320Sbholler movslq (%r10,%r8,4),%r9 22206320Sbholler lea (%r9,%r10,1),%r10 22216320Sbholler jmpq *%r10 22226320Sbholler 22236320Sbholler .balign 16 22246320SbhollerL(pre_both_aligned): 22256320Sbholler cmp $0x80,%r8 22266320Sbholler jl L(fix_16b) 22276320Sbholler 22286320Sbholler .balign 16 22296320SbhollerL(both_aligned): 22306320Sbholler 22316320Sbholler /* 22326320Sbholler * this 'paired' load/load/store/store seems to do best. 
22336320Sbholler */ 22346320Sbholler movdqa (%rdx),%xmm0 22356320Sbholler movdqa 0x10(%rdx),%xmm1 22366320Sbholler 22376320Sbholler movdqa %xmm0,(%rcx) 22386320Sbholler movdqa %xmm1,0x10(%rcx) 22396320Sbholler lea -0x80(%r8),%r8 22406320Sbholler 22416320Sbholler movdqa 0x20(%rdx),%xmm2 22426320Sbholler movdqa 0x30(%rdx),%xmm3 22436320Sbholler 22446320Sbholler movdqa %xmm2,0x20(%rcx) 22456320Sbholler movdqa %xmm3,0x30(%rcx) 22466320Sbholler 22476320Sbholler movdqa 0x40(%rdx),%xmm0 22486320Sbholler movdqa 0x50(%rdx),%xmm1 22496320Sbholler cmp $0x80,%r8 22506320Sbholler 22516320Sbholler movdqa %xmm0,0x40(%rcx) 22526320Sbholler movdqa %xmm1,0x50(%rcx) 22536320Sbholler 22546320Sbholler movdqa 0x60(%rdx),%xmm2 22556320Sbholler movdqa 0x70(%rdx),%xmm3 22566320Sbholler lea 0x80(%rdx),%rdx 22576320Sbholler movdqa %xmm2,0x60(%rcx) 22586320Sbholler movdqa %xmm3,0x70(%rcx) 22596320Sbholler lea 0x80(%rcx),%rcx 22606320Sbholler jge L(both_aligned) 22616320Sbholler 22626320SbhollerL(fix_16b): 22636320Sbholler add %r8,%rcx 22646320Sbholler lea L(fwdPxQx)(%rip),%r10 22656320Sbholler add %r8,%rdx 22666320Sbholler 22676320Sbholler movslq (%r10,%r8,4),%r9 22686320Sbholler lea (%r9,%r10,1),%r10 22696320Sbholler jmpq *%r10 22706320Sbholler 22716320Sbholler .balign 16 22726320SbhollerL(Loop8byte_pre): 22736320Sbholler # Use 8-byte moves 22746320Sbholler mov .largest_level_cache_size(%rip),%r9d 22756320Sbholler shr %r9 # take half of it 22766320Sbholler cmp %r9,%r8 22776320Sbholler jg L(byte8_nt_top) 22786320Sbholler # Find out whether to use rep movsq 22796320Sbholler cmp $4096,%r8 22806320Sbholler jle L(byte8_top) 22816320Sbholler mov .amd64cache1half(%rip),%r9d # half of l1 cache 22826320Sbholler cmp %r9,%r8 22836320Sbholler jle L(use_rep) 22846320Sbholler 22856320Sbholler .balign 16 22866320SbhollerL(byte8_top): 22876320Sbholler mov (%rdx),%r9 22886320Sbholler mov 0x8(%rdx),%r10 22896320Sbholler lea -0x40(%r8),%r8 22906320Sbholler mov %r9,(%rcx) 22916320Sbholler mov %r10,0x8(%rcx) 
22926320Sbholler mov 0x10(%rdx),%r11 22936320Sbholler mov 0x18(%rdx),%r9 22946320Sbholler mov %r11,0x10(%rcx) 22956320Sbholler mov %r9,0x18(%rcx) 22966320Sbholler 22976320Sbholler cmp $0x40,%r8 22986320Sbholler mov 0x20(%rdx),%r10 22996320Sbholler mov 0x28(%rdx),%r11 23006320Sbholler mov %r10,0x20(%rcx) 23016320Sbholler mov %r11,0x28(%rcx) 23026320Sbholler mov 0x30(%rdx),%r9 23036320Sbholler mov 0x38(%rdx),%r10 23046320Sbholler lea 0x40(%rdx),%rdx 23056320Sbholler mov %r9,0x30(%rcx) 23066320Sbholler mov %r10,0x38(%rcx) 23076320Sbholler lea 0x40(%rcx),%rcx 23086320Sbholler jg L(byte8_top) 23096320Sbholler 23106320SbhollerL(byte8_end): 23116320Sbholler lea L(fwdPxQx)(%rip),%r10 23126320Sbholler lea (%rdx,%r8,1),%rdx 23136320Sbholler lea (%rcx,%r8,1),%rcx 23146320Sbholler 23156320Sbholler movslq (%r10,%r8,4),%r9 23166320Sbholler lea (%r9,%r10,1),%r10 23176320Sbholler jmpq *%r10 23186320Sbholler 23196320Sbholler .balign 16 23206320SbhollerL(use_rep): 23216320Sbholler mov %rdx,%rsi # %rsi = source 23226320Sbholler mov %rcx,%rdi # %rdi = destination 23236320Sbholler mov %r8,%rcx # %rcx = count 23246320Sbholler shrq $3,%rcx # 8-byte word count 23250Sstevel@tonic-gate rep 23266320Sbholler movsq 23276320Sbholler mov %rsi,%rdx # source 23286320Sbholler mov %rdi,%rcx # destination 23296320Sbholler andq $7,%r8 # remainder 23306320Sbholler jnz L(byte8_end) 23310Sstevel@tonic-gate ret 23320Sstevel@tonic-gate 23336320Sbholler .balign 16 23346320SbhollerL(byte8_nt_top): 23356320Sbholler sub $0x40,%r8 23366320Sbholler prefetchnta 0x180(%rdx) 23376320Sbholler mov (%rdx),%r9 23386320Sbholler movnti %r9,(%rcx) 23396320Sbholler mov 0x8(%rdx),%r10 23406320Sbholler movnti %r10,0x8(%rcx) 23416320Sbholler mov 0x10(%rdx),%r11 23426320Sbholler movnti %r11,0x10(%rcx) 23436320Sbholler mov 0x18(%rdx),%r9 23446320Sbholler movnti %r9,0x18(%rcx) 23456320Sbholler mov 0x20(%rdx),%r10 23466320Sbholler movnti %r10,0x20(%rcx) 23476320Sbholler mov 0x28(%rdx),%r11 23486320Sbholler movnti %r11,0x28(%rcx) 
23496320Sbholler mov 0x30(%rdx),%r9 23506320Sbholler movnti %r9,0x30(%rcx) 23516320Sbholler mov 0x38(%rdx),%r10 23526320Sbholler movnti %r10,0x38(%rcx) 23536320Sbholler 23546320Sbholler lea 0x40(%rdx),%rdx 23556320Sbholler lea 0x40(%rcx),%rcx 23566320Sbholler cmp $0x40,%r8 23576320Sbholler jge L(byte8_nt_top) 23586320Sbholler sfence 23596320Sbholler jmp L(byte8_end) 23606320Sbholler 23616320Sbholler SET_SIZE(memcpy) 23626320Sbholler 23636320Sbholler .balign 16 23646320SbhollerL(CopyBackwards): 23656320Sbholler mov %rdx,%r8 23666320Sbholler mov %rdi,%rcx 23676320Sbholler mov %rsi,%rdx 23686320Sbholler mov %rdi,%rax # return value 23696320Sbholler 23706320Sbholler # ck alignment of last byte 23716320Sbholler lea (%rcx,%r8,1),%rcx 23726320Sbholler test $0x7,%rcx 23736320Sbholler lea (%rdx,%r8,1),%rdx 23746320Sbholler jne L(bk_align) 23756320Sbholler 23766320SbhollerL(bk_qw_aligned): 23776320Sbholler lea L(bkPxQx)(%rip),%r10 23786320Sbholler 23796320Sbholler cmp $0x90,%r8 # 144 23806320Sbholler jg L(bk_ck_sse2_alignment) 23816320Sbholler 23826320Sbholler sub %r8,%rcx 23836320Sbholler sub %r8,%rdx 23846320Sbholler 23856320Sbholler movslq (%r10,%r8,4),%r9 23866320Sbholler lea (%r9,%r10,1),%r10 23876320Sbholler jmpq *%r10 23886320Sbholler 23896320Sbholler .balign 16 23906320SbhollerL(bk_align): 23916320Sbholler # only align if len > 8 23926320Sbholler cmp $8,%r8 23936320Sbholler jle L(bk_qw_aligned) 23946320Sbholler test $0x1,%rcx 23956320Sbholler je L(bk_tst2) 23966320Sbholler dec %rcx 23976320Sbholler dec %rdx 23986320Sbholler dec %r8 23996320Sbholler mov (%rdx),%r9b 24006320Sbholler mov %r9b,(%rcx) 24016320Sbholler 24026320SbhollerL(bk_tst2): 24036320Sbholler test $0x2,%rcx 24046320Sbholler je L(bk_tst3) 24056320Sbholler 24066320SbhollerL(bk_got2): 24076320Sbholler sub $0x2,%rcx 24086320Sbholler sub $0x2,%rdx 24096320Sbholler sub $0x2,%r8 24106320Sbholler movzwq (%rdx),%r9 24116320Sbholler mov %r9w,(%rcx) 24126320Sbholler 24136320SbhollerL(bk_tst3): 24146320Sbholler 
test $0x4,%rcx 24156320Sbholler je L(bk_qw_aligned) 24166320Sbholler 24176320SbhollerL(bk_got3): 24186320Sbholler sub $0x4,%rcx 24196320Sbholler sub $0x4,%rdx 24206320Sbholler sub $0x4,%r8 24216320Sbholler mov (%rdx),%r9d 24226320Sbholler mov %r9d,(%rcx) 24236320Sbholler jmp L(bk_qw_aligned) 24246320Sbholler 24256320Sbholler .balign 16 24266320SbhollerL(bk_ck_sse2_alignment): 24276320Sbholler cmpl $NO_SSE,.memops_method(%rip) 24286320Sbholler je L(bk_use_rep) 24296320Sbholler # check alignment of last byte 24306320Sbholler test $0xf,%rcx 24316320Sbholler jz L(bk_sse2_cpy) 24326320Sbholler 24336320SbhollerL(bk_sse2_align): 24346320Sbholler # only here if already aligned on at least a qword bndry 24356320Sbholler sub $0x8,%rcx 24366320Sbholler sub $0x8,%rdx 24376320Sbholler sub $0x8,%r8 24386320Sbholler mov (%rdx),%r9 24396320Sbholler mov %r9,(%rcx) 24406320Sbholler #jmp L(bk_sse2_cpy) 24416320Sbholler 24426320Sbholler .balign 16 24436320SbhollerL(bk_sse2_cpy): 24446320Sbholler sub $0x80,%rcx # 128 24456320Sbholler sub $0x80,%rdx 24466320Sbholler movdqu 0x70(%rdx),%xmm3 24476320Sbholler movdqu 0x60(%rdx),%xmm2 24486320Sbholler movdqa %xmm3,0x70(%rcx) 24496320Sbholler movdqa %xmm2,0x60(%rcx) 24506320Sbholler sub $0x80,%r8 24516320Sbholler movdqu 0x50(%rdx),%xmm1 24526320Sbholler movdqu 0x40(%rdx),%xmm0 24536320Sbholler movdqa %xmm1,0x50(%rcx) 24546320Sbholler movdqa %xmm0,0x40(%rcx) 24556320Sbholler 24566320Sbholler cmp $0x80,%r8 24576320Sbholler movdqu 0x30(%rdx),%xmm3 24586320Sbholler movdqu 0x20(%rdx),%xmm2 24596320Sbholler movdqa %xmm3,0x30(%rcx) 24606320Sbholler movdqa %xmm2,0x20(%rcx) 24616320Sbholler movdqu 0x10(%rdx),%xmm1 24626320Sbholler movdqu (%rdx),%xmm0 24636320Sbholler movdqa %xmm1,0x10(%rcx) 24646320Sbholler movdqa %xmm0,(%rcx) 24656320Sbholler jge L(bk_sse2_cpy) 24666320Sbholler 24676320SbhollerL(bk_sse2_cpy_end): 24686320Sbholler lea L(bkPxQx)(%rip),%r10 24696320Sbholler sub %r8,%rdx 24706320Sbholler sub %r8,%rcx 24716320Sbholler movslq 
(%r10,%r8,4),%r9 24726320Sbholler lea (%r9,%r10,1),%r10 24736320Sbholler jmpq *%r10 24746320Sbholler 24756320Sbholler .balign 16 24766320SbhollerL(bk_use_rep): 24776320Sbholler xchg %rcx,%r9 24786320Sbholler mov %rdx,%rsi # source 24796320Sbholler mov %r9,%rdi # destination 24806320Sbholler mov %r8,%rcx # count 24816320Sbholler sub $8,%rsi 24826320Sbholler sub $8,%rdi 24836320Sbholler shr $3,%rcx 24846320Sbholler std # reverse direction 24856320Sbholler rep 24866320Sbholler movsq 24876320Sbholler cld # reset direction flag 24886320Sbholler 24896320Sbholler xchg %rcx,%r9 24906320Sbholler lea L(bkPxQx)(%rip),%r10 24916320Sbholler sub %r8,%rdx 24926320Sbholler sub %r8,%rcx 24936320Sbholler andq $7,%r8 # remainder 24946320Sbholler jz 2f 24956320Sbholler movslq (%r10,%r8,4),%r9 24966320Sbholler lea (%r9,%r10,1),%r10 24976320Sbholler jmpq *%r10 24986320Sbholler2: 24996320Sbholler ret 25006320Sbholler 25016320Sbholler .balign 16 25026320SbhollerL(bkP0QI): 25036320Sbholler mov 0x88(%rdx),%r10 25046320Sbholler mov %r10,0x88(%rcx) 25056320SbhollerL(bkP0QH): 25066320Sbholler mov 0x80(%rdx),%r10 25076320Sbholler mov %r10,0x80(%rcx) 25086320SbhollerL(bkP0QG): 25096320Sbholler mov 0x78(%rdx),%r9 25106320Sbholler mov %r9,0x78(%rcx) 25116320SbhollerL(bkP0QF): 25126320Sbholler mov 0x70(%rdx),%r11 25136320Sbholler mov %r11,0x70(%rcx) 25146320SbhollerL(bkP0QE): 25156320Sbholler mov 0x68(%rdx),%r10 25166320Sbholler mov %r10,0x68(%rcx) 25176320SbhollerL(bkP0QD): 25186320Sbholler mov 0x60(%rdx),%r9 25196320Sbholler mov %r9,0x60(%rcx) 25206320SbhollerL(bkP0QC): 25216320Sbholler mov 0x58(%rdx),%r11 25226320Sbholler mov %r11,0x58(%rcx) 25236320SbhollerL(bkP0QB): 25246320Sbholler mov 0x50(%rdx),%r10 25256320Sbholler mov %r10,0x50(%rcx) 25266320SbhollerL(bkP0QA): 25276320Sbholler mov 0x48(%rdx),%r9 25286320Sbholler mov %r9,0x48(%rcx) 25296320SbhollerL(bkP0Q9): 25306320Sbholler mov 0x40(%rdx),%r11 25316320Sbholler mov %r11,0x40(%rcx) 25326320SbhollerL(bkP0Q8): 25336320Sbholler mov 
0x38(%rdx),%r10 25346320Sbholler mov %r10,0x38(%rcx) 25356320SbhollerL(bkP0Q7): 25366320Sbholler mov 0x30(%rdx),%r9 25376320Sbholler mov %r9,0x30(%rcx) 25386320SbhollerL(bkP0Q6): 25396320Sbholler mov 0x28(%rdx),%r11 25406320Sbholler mov %r11,0x28(%rcx) 25416320SbhollerL(bkP0Q5): 25426320Sbholler mov 0x20(%rdx),%r10 25436320Sbholler mov %r10,0x20(%rcx) 25446320SbhollerL(bkP0Q4): 25456320Sbholler mov 0x18(%rdx),%r9 25466320Sbholler mov %r9,0x18(%rcx) 25476320SbhollerL(bkP0Q3): 25486320Sbholler mov 0x10(%rdx),%r11 25496320Sbholler mov %r11,0x10(%rcx) 25506320SbhollerL(bkP0Q2): 25516320Sbholler mov 0x8(%rdx),%r10 25526320Sbholler mov %r10,0x8(%rcx) 25536320SbhollerL(bkP0Q1): 25546320Sbholler mov (%rdx),%r9 25556320Sbholler mov %r9,(%rcx) 25566320SbhollerL(bkP0Q0): 25576320Sbholler ret 25586320Sbholler 25596320Sbholler .balign 16 25606320SbhollerL(bkP1QI): 25616320Sbholler mov 0x89(%rdx),%r10 25626320Sbholler mov %r10,0x89(%rcx) 25636320SbhollerL(bkP1QH): 25646320Sbholler mov 0x81(%rdx),%r11 25656320Sbholler mov %r11,0x81(%rcx) 25666320SbhollerL(bkP1QG): 25676320Sbholler mov 0x79(%rdx),%r10 25686320Sbholler mov %r10,0x79(%rcx) 25696320SbhollerL(bkP1QF): 25706320Sbholler mov 0x71(%rdx),%r9 25716320Sbholler mov %r9,0x71(%rcx) 25726320SbhollerL(bkP1QE): 25736320Sbholler mov 0x69(%rdx),%r11 25746320Sbholler mov %r11,0x69(%rcx) 25756320SbhollerL(bkP1QD): 25766320Sbholler mov 0x61(%rdx),%r10 25776320Sbholler mov %r10,0x61(%rcx) 25786320SbhollerL(bkP1QC): 25796320Sbholler mov 0x59(%rdx),%r9 25806320Sbholler mov %r9,0x59(%rcx) 25816320SbhollerL(bkP1QB): 25826320Sbholler mov 0x51(%rdx),%r11 25836320Sbholler mov %r11,0x51(%rcx) 25846320SbhollerL(bkP1QA): 25856320Sbholler mov 0x49(%rdx),%r10 25866320Sbholler mov %r10,0x49(%rcx) 25876320SbhollerL(bkP1Q9): 25886320Sbholler mov 0x41(%rdx),%r9 25896320Sbholler mov %r9,0x41(%rcx) 25906320SbhollerL(bkP1Q8): 25916320Sbholler mov 0x39(%rdx),%r11 25926320Sbholler mov %r11,0x39(%rcx) 25936320SbhollerL(bkP1Q7): 25946320Sbholler mov 
0x31(%rdx),%r10 25956320Sbholler mov %r10,0x31(%rcx) 25966320SbhollerL(bkP1Q6): 25976320Sbholler mov 0x29(%rdx),%r9 25986320Sbholler mov %r9,0x29(%rcx) 25996320SbhollerL(bkP1Q5): 26006320Sbholler mov 0x21(%rdx),%r11 26016320Sbholler mov %r11,0x21(%rcx) 26026320SbhollerL(bkP1Q4): 26036320Sbholler mov 0x19(%rdx),%r10 26046320Sbholler mov %r10,0x19(%rcx) 26056320SbhollerL(bkP1Q3): 26066320Sbholler mov 0x11(%rdx),%r9 26076320Sbholler mov %r9,0x11(%rcx) 26086320SbhollerL(bkP1Q2): 26096320Sbholler mov 0x9(%rdx),%r11 26106320Sbholler mov %r11,0x9(%rcx) 26116320SbhollerL(bkP1Q1): 26126320Sbholler mov 0x1(%rdx),%r10 26136320Sbholler mov %r10,0x1(%rcx) 26146320SbhollerL(bkP1Q0): 26156320Sbholler mov (%rdx),%r9b 26166320Sbholler mov %r9b,(%rcx) 26176320Sbholler ret 26186320Sbholler 26196320Sbholler .balign 16 26206320SbhollerL(bkP2QI): 26216320Sbholler mov 0x8a(%rdx),%r10 26226320Sbholler mov %r10,0x8a(%rcx) 26236320SbhollerL(bkP2QH): 26246320Sbholler mov 0x82(%rdx),%r11 26256320Sbholler mov %r11,0x82(%rcx) 26266320SbhollerL(bkP2QG): 26276320Sbholler mov 0x7a(%rdx),%r10 26286320Sbholler mov %r10,0x7a(%rcx) 26296320SbhollerL(bkP2QF): 26306320Sbholler mov 0x72(%rdx),%r9 26316320Sbholler mov %r9,0x72(%rcx) 26326320SbhollerL(bkP2QE): 26336320Sbholler mov 0x6a(%rdx),%r11 26346320Sbholler mov %r11,0x6a(%rcx) 26356320SbhollerL(bkP2QD): 26366320Sbholler mov 0x62(%rdx),%r10 26376320Sbholler mov %r10,0x62(%rcx) 26386320SbhollerL(bkP2QC): 26396320Sbholler mov 0x5a(%rdx),%r9 26406320Sbholler mov %r9,0x5a(%rcx) 26416320SbhollerL(bkP2QB): 26426320Sbholler mov 0x52(%rdx),%r11 26436320Sbholler mov %r11,0x52(%rcx) 26446320SbhollerL(bkP2QA): 26456320Sbholler mov 0x4a(%rdx),%r10 26466320Sbholler mov %r10,0x4a(%rcx) 26476320SbhollerL(bkP2Q9): 26486320Sbholler mov 0x42(%rdx),%r9 26496320Sbholler mov %r9,0x42(%rcx) 26506320SbhollerL(bkP2Q8): 26516320Sbholler mov 0x3a(%rdx),%r11 26526320Sbholler mov %r11,0x3a(%rcx) 26536320SbhollerL(bkP2Q7): 26546320Sbholler mov 0x32(%rdx),%r10 26556320Sbholler mov 
%r10,0x32(%rcx) 26566320SbhollerL(bkP2Q6): 26576320Sbholler mov 0x2a(%rdx),%r9 26586320Sbholler mov %r9,0x2a(%rcx) 26596320SbhollerL(bkP2Q5): 26606320Sbholler mov 0x22(%rdx),%r11 26616320Sbholler mov %r11,0x22(%rcx) 26626320SbhollerL(bkP2Q4): 26636320Sbholler mov 0x1a(%rdx),%r10 26646320Sbholler mov %r10,0x1a(%rcx) 26656320SbhollerL(bkP2Q3): 26666320Sbholler mov 0x12(%rdx),%r9 26676320Sbholler mov %r9,0x12(%rcx) 26686320SbhollerL(bkP2Q2): 26696320Sbholler mov 0xa(%rdx),%r11 26706320Sbholler mov %r11,0xa(%rcx) 26716320SbhollerL(bkP2Q1): 26726320Sbholler mov 0x2(%rdx),%r10 26736320Sbholler mov %r10,0x2(%rcx) 26746320SbhollerL(bkP2Q0): 26756320Sbholler mov (%rdx),%r9w 26766320Sbholler mov %r9w,(%rcx) 26776320Sbholler ret 26786320Sbholler 26796320Sbholler .balign 16 26806320SbhollerL(bkP3QI): 26816320Sbholler mov 0x8b(%rdx),%r10 26826320Sbholler mov %r10,0x8b(%rcx) 26836320SbhollerL(bkP3QH): 26846320Sbholler mov 0x83(%rdx),%r11 26856320Sbholler mov %r11,0x83(%rcx) 26866320SbhollerL(bkP3QG): 26876320Sbholler mov 0x7b(%rdx),%r10 26886320Sbholler mov %r10,0x7b(%rcx) 26896320SbhollerL(bkP3QF): 26906320Sbholler mov 0x73(%rdx),%r9 26916320Sbholler mov %r9,0x73(%rcx) 26926320SbhollerL(bkP3QE): 26936320Sbholler mov 0x6b(%rdx),%r11 26946320Sbholler mov %r11,0x6b(%rcx) 26956320SbhollerL(bkP3QD): 26966320Sbholler mov 0x63(%rdx),%r10 26976320Sbholler mov %r10,0x63(%rcx) 26986320SbhollerL(bkP3QC): 26996320Sbholler mov 0x5b(%rdx),%r9 27006320Sbholler mov %r9,0x5b(%rcx) 27016320SbhollerL(bkP3QB): 27026320Sbholler mov 0x53(%rdx),%r11 27036320Sbholler mov %r11,0x53(%rcx) 27046320SbhollerL(bkP3QA): 27056320Sbholler mov 0x4b(%rdx),%r10 27066320Sbholler mov %r10,0x4b(%rcx) 27076320SbhollerL(bkP3Q9): 27086320Sbholler mov 0x43(%rdx),%r9 27096320Sbholler mov %r9,0x43(%rcx) 27106320SbhollerL(bkP3Q8): 27116320Sbholler mov 0x3b(%rdx),%r11 27126320Sbholler mov %r11,0x3b(%rcx) 27136320SbhollerL(bkP3Q7): 27146320Sbholler mov 0x33(%rdx),%r10 27156320Sbholler mov %r10,0x33(%rcx) 
27166320SbhollerL(bkP3Q6): 27176320Sbholler mov 0x2b(%rdx),%r9 27186320Sbholler mov %r9,0x2b(%rcx) 27196320SbhollerL(bkP3Q5): 27206320Sbholler mov 0x23(%rdx),%r11 27216320Sbholler mov %r11,0x23(%rcx) 27226320SbhollerL(bkP3Q4): 27236320Sbholler mov 0x1b(%rdx),%r10 27246320Sbholler mov %r10,0x1b(%rcx) 27256320SbhollerL(bkP3Q3): 27266320Sbholler mov 0x13(%rdx),%r9 27276320Sbholler mov %r9,0x13(%rcx) 27286320SbhollerL(bkP3Q2): 27296320Sbholler mov 0xb(%rdx),%r11 27306320Sbholler mov %r11,0xb(%rcx) 27316320SbhollerL(bkP3Q1): 27326320Sbholler mov 0x3(%rdx),%r10 27336320Sbholler mov %r10,0x3(%rcx) 27346320SbhollerL(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores 27356320Sbholler mov 0x1(%rdx),%r9w 27366320Sbholler mov %r9w,0x1(%rcx) 27376320Sbholler mov (%rdx),%r10b 27386320Sbholler mov %r10b,(%rcx) 27396320Sbholler ret 27406320Sbholler 27416320Sbholler .balign 16 27426320SbhollerL(bkP4QI): 27436320Sbholler mov 0x8c(%rdx),%r10 27446320Sbholler mov %r10,0x8c(%rcx) 27456320SbhollerL(bkP4QH): 27466320Sbholler mov 0x84(%rdx),%r11 27476320Sbholler mov %r11,0x84(%rcx) 27486320SbhollerL(bkP4QG): 27496320Sbholler mov 0x7c(%rdx),%r10 27506320Sbholler mov %r10,0x7c(%rcx) 27516320SbhollerL(bkP4QF): 27526320Sbholler mov 0x74(%rdx),%r9 27536320Sbholler mov %r9,0x74(%rcx) 27546320SbhollerL(bkP4QE): 27556320Sbholler mov 0x6c(%rdx),%r11 27566320Sbholler mov %r11,0x6c(%rcx) 27576320SbhollerL(bkP4QD): 27586320Sbholler mov 0x64(%rdx),%r10 27596320Sbholler mov %r10,0x64(%rcx) 27606320SbhollerL(bkP4QC): 27616320Sbholler mov 0x5c(%rdx),%r9 27626320Sbholler mov %r9,0x5c(%rcx) 27636320SbhollerL(bkP4QB): 27646320Sbholler mov 0x54(%rdx),%r11 27656320Sbholler mov %r11,0x54(%rcx) 27666320SbhollerL(bkP4QA): 27676320Sbholler mov 0x4c(%rdx),%r10 27686320Sbholler mov %r10,0x4c(%rcx) 27696320SbhollerL(bkP4Q9): 27706320Sbholler mov 0x44(%rdx),%r9 27716320Sbholler mov %r9,0x44(%rcx) 27726320SbhollerL(bkP4Q8): 27736320Sbholler mov 0x3c(%rdx),%r11 27746320Sbholler mov 
%r11,0x3c(%rcx) 27756320SbhollerL(bkP4Q7): 27766320Sbholler mov 0x34(%rdx),%r10 27776320Sbholler mov %r10,0x34(%rcx) 27786320SbhollerL(bkP4Q6): 27796320Sbholler mov 0x2c(%rdx),%r9 27806320Sbholler mov %r9,0x2c(%rcx) 27816320SbhollerL(bkP4Q5): 27826320Sbholler mov 0x24(%rdx),%r11 27836320Sbholler mov %r11,0x24(%rcx) 27846320SbhollerL(bkP4Q4): 27856320Sbholler mov 0x1c(%rdx),%r10 27866320Sbholler mov %r10,0x1c(%rcx) 27876320SbhollerL(bkP4Q3): 27886320Sbholler mov 0x14(%rdx),%r9 27896320Sbholler mov %r9,0x14(%rcx) 27906320SbhollerL(bkP4Q2): 27916320Sbholler mov 0xc(%rdx),%r11 27926320Sbholler mov %r11,0xc(%rcx) 27936320SbhollerL(bkP4Q1): 27946320Sbholler mov 0x4(%rdx),%r10 27956320Sbholler mov %r10,0x4(%rcx) 27966320SbhollerL(bkP4Q0): 27976320Sbholler mov (%rdx),%r9d 27986320Sbholler mov %r9d,(%rcx) 27996320Sbholler ret 28006320Sbholler 28016320Sbholler .balign 16 28026320SbhollerL(bkP5QI): 28036320Sbholler mov 0x8d(%rdx),%r10 28046320Sbholler mov %r10,0x8d(%rcx) 28056320SbhollerL(bkP5QH): 28066320Sbholler mov 0x85(%rdx),%r9 28076320Sbholler mov %r9,0x85(%rcx) 28086320SbhollerL(bkP5QG): 28096320Sbholler mov 0x7d(%rdx),%r11 28106320Sbholler mov %r11,0x7d(%rcx) 28116320SbhollerL(bkP5QF): 28126320Sbholler mov 0x75(%rdx),%r10 28136320Sbholler mov %r10,0x75(%rcx) 28146320SbhollerL(bkP5QE): 28156320Sbholler mov 0x6d(%rdx),%r9 28166320Sbholler mov %r9,0x6d(%rcx) 28176320SbhollerL(bkP5QD): 28186320Sbholler mov 0x65(%rdx),%r11 28196320Sbholler mov %r11,0x65(%rcx) 28206320SbhollerL(bkP5QC): 28216320Sbholler mov 0x5d(%rdx),%r10 28226320Sbholler mov %r10,0x5d(%rcx) 28236320SbhollerL(bkP5QB): 28246320Sbholler mov 0x55(%rdx),%r9 28256320Sbholler mov %r9,0x55(%rcx) 28266320SbhollerL(bkP5QA): 28276320Sbholler mov 0x4d(%rdx),%r11 28286320Sbholler mov %r11,0x4d(%rcx) 28296320SbhollerL(bkP5Q9): 28306320Sbholler mov 0x45(%rdx),%r10 28316320Sbholler mov %r10,0x45(%rcx) 28326320SbhollerL(bkP5Q8): 28336320Sbholler mov 0x3d(%rdx),%r9 28346320Sbholler mov %r9,0x3d(%rcx) 
28356320SbhollerL(bkP5Q7): 28366320Sbholler mov 0x35(%rdx),%r11 28376320Sbholler mov %r11,0x35(%rcx) 28386320SbhollerL(bkP5Q6): 28396320Sbholler mov 0x2d(%rdx),%r10 28406320Sbholler mov %r10,0x2d(%rcx) 28416320SbhollerL(bkP5Q5): 28426320Sbholler mov 0x25(%rdx),%r9 28436320Sbholler mov %r9,0x25(%rcx) 28446320SbhollerL(bkP5Q4): 28456320Sbholler mov 0x1d(%rdx),%r11 28466320Sbholler mov %r11,0x1d(%rcx) 28476320SbhollerL(bkP5Q3): 28486320Sbholler mov 0x15(%rdx),%r10 28496320Sbholler mov %r10,0x15(%rcx) 28506320SbhollerL(bkP5Q2): 28516320Sbholler mov 0xd(%rdx),%r9 28526320Sbholler mov %r9,0xd(%rcx) 28536320SbhollerL(bkP5Q1): 28546320Sbholler mov 0x5(%rdx),%r11 28556320Sbholler mov %r11,0x5(%rcx) 28566320SbhollerL(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores 28576320Sbholler mov 0x1(%rdx),%r9d 28586320Sbholler mov %r9d,0x1(%rcx) 28596320Sbholler mov (%rdx),%r10b 28606320Sbholler mov %r10b,(%rcx) 28616320Sbholler ret 28626320Sbholler 28636320Sbholler .balign 16 28646320SbhollerL(bkP6QI): 28656320Sbholler mov 0x8e(%rdx),%r10 28666320Sbholler mov %r10,0x8e(%rcx) 28676320SbhollerL(bkP6QH): 28686320Sbholler mov 0x86(%rdx),%r11 28696320Sbholler mov %r11,0x86(%rcx) 28706320SbhollerL(bkP6QG): 28716320Sbholler mov 0x7e(%rdx),%r10 28726320Sbholler mov %r10,0x7e(%rcx) 28736320SbhollerL(bkP6QF): 28746320Sbholler mov 0x76(%rdx),%r9 28756320Sbholler mov %r9,0x76(%rcx) 28766320SbhollerL(bkP6QE): 28776320Sbholler mov 0x6e(%rdx),%r11 28786320Sbholler mov %r11,0x6e(%rcx) 28796320SbhollerL(bkP6QD): 28806320Sbholler mov 0x66(%rdx),%r10 28816320Sbholler mov %r10,0x66(%rcx) 28826320SbhollerL(bkP6QC): 28836320Sbholler mov 0x5e(%rdx),%r9 28846320Sbholler mov %r9,0x5e(%rcx) 28856320SbhollerL(bkP6QB): 28866320Sbholler mov 0x56(%rdx),%r11 28876320Sbholler mov %r11,0x56(%rcx) 28886320SbhollerL(bkP6QA): 28896320Sbholler mov 0x4e(%rdx),%r10 28906320Sbholler mov %r10,0x4e(%rcx) 28916320SbhollerL(bkP6Q9): 28926320Sbholler mov 0x46(%rdx),%r9 28936320Sbholler mov 
%r9,0x46(%rcx) 28946320SbhollerL(bkP6Q8): 28956320Sbholler mov 0x3e(%rdx),%r11 28966320Sbholler mov %r11,0x3e(%rcx) 28976320SbhollerL(bkP6Q7): 28986320Sbholler mov 0x36(%rdx),%r10 28996320Sbholler mov %r10,0x36(%rcx) 29006320SbhollerL(bkP6Q6): 29016320Sbholler mov 0x2e(%rdx),%r9 29026320Sbholler mov %r9,0x2e(%rcx) 29036320SbhollerL(bkP6Q5): 29046320Sbholler mov 0x26(%rdx),%r11 29056320Sbholler mov %r11,0x26(%rcx) 29066320SbhollerL(bkP6Q4): 29076320Sbholler mov 0x1e(%rdx),%r10 29086320Sbholler mov %r10,0x1e(%rcx) 29096320SbhollerL(bkP6Q3): 29106320Sbholler mov 0x16(%rdx),%r9 29116320Sbholler mov %r9,0x16(%rcx) 29126320SbhollerL(bkP6Q2): 29136320Sbholler mov 0xe(%rdx),%r11 29146320Sbholler mov %r11,0xe(%rcx) 29156320SbhollerL(bkP6Q1): 29166320Sbholler mov 0x6(%rdx),%r10 29176320Sbholler mov %r10,0x6(%rcx) 29186320SbhollerL(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores 29196320Sbholler mov 0x2(%rdx),%r9d 29206320Sbholler mov %r9d,0x2(%rcx) 29216320Sbholler mov (%rdx),%r10w 29226320Sbholler mov %r10w,(%rcx) 29236320Sbholler ret 29246320Sbholler 29256320Sbholler .balign 16 29266320SbhollerL(bkP7QI): 29276320Sbholler mov 0x8f(%rdx),%r10 29286320Sbholler mov %r10,0x8f(%rcx) 29296320SbhollerL(bkP7QH): 29306320Sbholler mov 0x87(%rdx),%r11 29316320Sbholler mov %r11,0x87(%rcx) 29326320SbhollerL(bkP7QG): 29336320Sbholler mov 0x7f(%rdx),%r10 29346320Sbholler mov %r10,0x7f(%rcx) 29356320SbhollerL(bkP7QF): 29366320Sbholler mov 0x77(%rdx),%r9 29376320Sbholler mov %r9,0x77(%rcx) 29386320SbhollerL(bkP7QE): 29396320Sbholler mov 0x6f(%rdx),%r11 29406320Sbholler mov %r11,0x6f(%rcx) 29416320SbhollerL(bkP7QD): 29426320Sbholler mov 0x67(%rdx),%r10 29436320Sbholler mov %r10,0x67(%rcx) 29446320SbhollerL(bkP7QC): 29456320Sbholler mov 0x5f(%rdx),%r9 29466320Sbholler mov %r9,0x5f(%rcx) 29476320SbhollerL(bkP7QB): 29486320Sbholler mov 0x57(%rdx),%r11 29496320Sbholler mov %r11,0x57(%rcx) 29506320SbhollerL(bkP7QA): 29516320Sbholler mov 0x4f(%rdx),%r10 29526320Sbholler 
mov %r10,0x4f(%rcx) 29536320SbhollerL(bkP7Q9): 29546320Sbholler mov 0x47(%rdx),%r9 29556320Sbholler mov %r9,0x47(%rcx) 29566320SbhollerL(bkP7Q8): 29576320Sbholler mov 0x3f(%rdx),%r11 29586320Sbholler mov %r11,0x3f(%rcx) 29596320SbhollerL(bkP7Q7): 29606320Sbholler mov 0x37(%rdx),%r10 29616320Sbholler mov %r10,0x37(%rcx) 29626320SbhollerL(bkP7Q6): 29636320Sbholler mov 0x2f(%rdx),%r9 29646320Sbholler mov %r9,0x2f(%rcx) 29656320SbhollerL(bkP7Q5): 29666320Sbholler mov 0x27(%rdx),%r11 29676320Sbholler mov %r11,0x27(%rcx) 29686320SbhollerL(bkP7Q4): 29696320Sbholler mov 0x1f(%rdx),%r10 29706320Sbholler mov %r10,0x1f(%rcx) 29716320SbhollerL(bkP7Q3): 29726320Sbholler mov 0x17(%rdx),%r9 29736320Sbholler mov %r9,0x17(%rcx) 29746320SbhollerL(bkP7Q2): 29756320Sbholler mov 0xf(%rdx),%r11 29766320Sbholler mov %r11,0xf(%rcx) 29776320SbhollerL(bkP7Q1): 29786320Sbholler mov 0x7(%rdx),%r10 29796320Sbholler mov %r10,0x7(%rcx) 29806320SbhollerL(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores 29816320Sbholler mov 0x3(%rdx),%r9d 29826320Sbholler mov %r9d,0x3(%rcx) 29836320Sbholler mov 0x1(%rdx),%r10w 29846320Sbholler mov %r10w,0x1(%rcx) 29856320Sbholler mov (%rdx),%r11b 29866320Sbholler mov %r11b,(%rcx) 29876320Sbholler ret 29886320Sbholler 29896320Sbholler .balign 16 29906320SbhollerL(bkPxQx): .int L(bkP0Q0)-L(bkPxQx) 29916320Sbholler .int L(bkP1Q0)-L(bkPxQx) 29926320Sbholler .int L(bkP2Q0)-L(bkPxQx) 29936320Sbholler .int L(bkP3Q0)-L(bkPxQx) 29946320Sbholler .int L(bkP4Q0)-L(bkPxQx) 29956320Sbholler .int L(bkP5Q0)-L(bkPxQx) 29966320Sbholler .int L(bkP6Q0)-L(bkPxQx) 29976320Sbholler .int L(bkP7Q0)-L(bkPxQx) 29986320Sbholler 29996320Sbholler .int L(bkP0Q1)-L(bkPxQx) 30006320Sbholler .int L(bkP1Q1)-L(bkPxQx) 30016320Sbholler .int L(bkP2Q1)-L(bkPxQx) 30026320Sbholler .int L(bkP3Q1)-L(bkPxQx) 30036320Sbholler .int L(bkP4Q1)-L(bkPxQx) 30046320Sbholler .int L(bkP5Q1)-L(bkPxQx) 30056320Sbholler .int L(bkP6Q1)-L(bkPxQx) 30066320Sbholler .int L(bkP7Q1)-L(bkPxQx) 
30076320Sbholler 30086320Sbholler .int L(bkP0Q2)-L(bkPxQx) 30096320Sbholler .int L(bkP1Q2)-L(bkPxQx) 30106320Sbholler .int L(bkP2Q2)-L(bkPxQx) 30116320Sbholler .int L(bkP3Q2)-L(bkPxQx) 30126320Sbholler .int L(bkP4Q2)-L(bkPxQx) 30136320Sbholler .int L(bkP5Q2)-L(bkPxQx) 30146320Sbholler .int L(bkP6Q2)-L(bkPxQx) 30156320Sbholler .int L(bkP7Q2)-L(bkPxQx) 30166320Sbholler 30176320Sbholler .int L(bkP0Q3)-L(bkPxQx) 30186320Sbholler .int L(bkP1Q3)-L(bkPxQx) 30196320Sbholler .int L(bkP2Q3)-L(bkPxQx) 30206320Sbholler .int L(bkP3Q3)-L(bkPxQx) 30216320Sbholler .int L(bkP4Q3)-L(bkPxQx) 30226320Sbholler .int L(bkP5Q3)-L(bkPxQx) 30236320Sbholler .int L(bkP6Q3)-L(bkPxQx) 30246320Sbholler .int L(bkP7Q3)-L(bkPxQx) 30256320Sbholler 30266320Sbholler .int L(bkP0Q4)-L(bkPxQx) 30276320Sbholler .int L(bkP1Q4)-L(bkPxQx) 30286320Sbholler .int L(bkP2Q4)-L(bkPxQx) 30296320Sbholler .int L(bkP3Q4)-L(bkPxQx) 30306320Sbholler .int L(bkP4Q4)-L(bkPxQx) 30316320Sbholler .int L(bkP5Q4)-L(bkPxQx) 30326320Sbholler .int L(bkP6Q4)-L(bkPxQx) 30336320Sbholler .int L(bkP7Q4)-L(bkPxQx) 30346320Sbholler 30356320Sbholler .int L(bkP0Q5)-L(bkPxQx) 30366320Sbholler .int L(bkP1Q5)-L(bkPxQx) 30376320Sbholler .int L(bkP2Q5)-L(bkPxQx) 30386320Sbholler .int L(bkP3Q5)-L(bkPxQx) 30396320Sbholler .int L(bkP4Q5)-L(bkPxQx) 30406320Sbholler .int L(bkP5Q5)-L(bkPxQx) 30416320Sbholler .int L(bkP6Q5)-L(bkPxQx) 30426320Sbholler .int L(bkP7Q5)-L(bkPxQx) 30436320Sbholler 30446320Sbholler .int L(bkP0Q6)-L(bkPxQx) 30456320Sbholler .int L(bkP1Q6)-L(bkPxQx) 30466320Sbholler .int L(bkP2Q6)-L(bkPxQx) 30476320Sbholler .int L(bkP3Q6)-L(bkPxQx) 30486320Sbholler .int L(bkP4Q6)-L(bkPxQx) 30496320Sbholler .int L(bkP5Q6)-L(bkPxQx) 30506320Sbholler .int L(bkP6Q6)-L(bkPxQx) 30516320Sbholler .int L(bkP7Q6)-L(bkPxQx) 30526320Sbholler 30536320Sbholler .int L(bkP0Q7)-L(bkPxQx) 30546320Sbholler .int L(bkP1Q7)-L(bkPxQx) 30556320Sbholler .int L(bkP2Q7)-L(bkPxQx) 30566320Sbholler .int L(bkP3Q7)-L(bkPxQx) 30576320Sbholler .int L(bkP4Q7)-L(bkPxQx) 
30586320Sbholler .int L(bkP5Q7)-L(bkPxQx) 30596320Sbholler .int L(bkP6Q7)-L(bkPxQx) 30606320Sbholler .int L(bkP7Q7)-L(bkPxQx) 30616320Sbholler 30626320Sbholler .int L(bkP0Q8)-L(bkPxQx) 30636320Sbholler .int L(bkP1Q8)-L(bkPxQx) 30646320Sbholler .int L(bkP2Q8)-L(bkPxQx) 30656320Sbholler .int L(bkP3Q8)-L(bkPxQx) 30666320Sbholler .int L(bkP4Q8)-L(bkPxQx) 30676320Sbholler .int L(bkP5Q8)-L(bkPxQx) 30686320Sbholler .int L(bkP6Q8)-L(bkPxQx) 30696320Sbholler .int L(bkP7Q8)-L(bkPxQx) 30706320Sbholler 30716320Sbholler .int L(bkP0Q9)-L(bkPxQx) 30726320Sbholler .int L(bkP1Q9)-L(bkPxQx) 30736320Sbholler .int L(bkP2Q9)-L(bkPxQx) 30746320Sbholler .int L(bkP3Q9)-L(bkPxQx) 30756320Sbholler .int L(bkP4Q9)-L(bkPxQx) 30766320Sbholler .int L(bkP5Q9)-L(bkPxQx) 30776320Sbholler .int L(bkP6Q9)-L(bkPxQx) 30786320Sbholler .int L(bkP7Q9)-L(bkPxQx) 30796320Sbholler 30806320Sbholler .int L(bkP0QA)-L(bkPxQx) 30816320Sbholler .int L(bkP1QA)-L(bkPxQx) 30826320Sbholler .int L(bkP2QA)-L(bkPxQx) 30836320Sbholler .int L(bkP3QA)-L(bkPxQx) 30846320Sbholler .int L(bkP4QA)-L(bkPxQx) 30856320Sbholler .int L(bkP5QA)-L(bkPxQx) 30866320Sbholler .int L(bkP6QA)-L(bkPxQx) 30876320Sbholler .int L(bkP7QA)-L(bkPxQx) 30886320Sbholler 30896320Sbholler .int L(bkP0QB)-L(bkPxQx) 30906320Sbholler .int L(bkP1QB)-L(bkPxQx) 30916320Sbholler .int L(bkP2QB)-L(bkPxQx) 30926320Sbholler .int L(bkP3QB)-L(bkPxQx) 30936320Sbholler .int L(bkP4QB)-L(bkPxQx) 30946320Sbholler .int L(bkP5QB)-L(bkPxQx) 30956320Sbholler .int L(bkP6QB)-L(bkPxQx) 30966320Sbholler .int L(bkP7QB)-L(bkPxQx) 30976320Sbholler 30986320Sbholler .int L(bkP0QC)-L(bkPxQx) 30996320Sbholler .int L(bkP1QC)-L(bkPxQx) 31006320Sbholler .int L(bkP2QC)-L(bkPxQx) 31016320Sbholler .int L(bkP3QC)-L(bkPxQx) 31026320Sbholler .int L(bkP4QC)-L(bkPxQx) 31036320Sbholler .int L(bkP5QC)-L(bkPxQx) 31046320Sbholler .int L(bkP6QC)-L(bkPxQx) 31056320Sbholler .int L(bkP7QC)-L(bkPxQx) 31066320Sbholler 31076320Sbholler .int L(bkP0QD)-L(bkPxQx) 31086320Sbholler .int L(bkP1QD)-L(bkPxQx) 
31096320Sbholler .int L(bkP2QD)-L(bkPxQx) 31106320Sbholler .int L(bkP3QD)-L(bkPxQx) 31116320Sbholler .int L(bkP4QD)-L(bkPxQx) 31126320Sbholler .int L(bkP5QD)-L(bkPxQx) 31136320Sbholler .int L(bkP6QD)-L(bkPxQx) 31146320Sbholler .int L(bkP7QD)-L(bkPxQx) 31156320Sbholler 31166320Sbholler .int L(bkP0QE)-L(bkPxQx) 31176320Sbholler .int L(bkP1QE)-L(bkPxQx) 31186320Sbholler .int L(bkP2QE)-L(bkPxQx) 31196320Sbholler .int L(bkP3QE)-L(bkPxQx) 31206320Sbholler .int L(bkP4QE)-L(bkPxQx) 31216320Sbholler .int L(bkP5QE)-L(bkPxQx) 31226320Sbholler .int L(bkP6QE)-L(bkPxQx) 31236320Sbholler .int L(bkP7QE)-L(bkPxQx) 31246320Sbholler 31256320Sbholler .int L(bkP0QF)-L(bkPxQx) 31266320Sbholler .int L(bkP1QF)-L(bkPxQx) 31276320Sbholler .int L(bkP2QF)-L(bkPxQx) 31286320Sbholler .int L(bkP3QF)-L(bkPxQx) 31296320Sbholler .int L(bkP4QF)-L(bkPxQx) 31306320Sbholler .int L(bkP5QF)-L(bkPxQx) 31316320Sbholler .int L(bkP6QF)-L(bkPxQx) 31326320Sbholler .int L(bkP7QF)-L(bkPxQx) 31336320Sbholler 31346320Sbholler .int L(bkP0QG)-L(bkPxQx) 31356320Sbholler .int L(bkP1QG)-L(bkPxQx) 31366320Sbholler .int L(bkP2QG)-L(bkPxQx) 31376320Sbholler .int L(bkP3QG)-L(bkPxQx) 31386320Sbholler .int L(bkP4QG)-L(bkPxQx) 31396320Sbholler .int L(bkP5QG)-L(bkPxQx) 31406320Sbholler .int L(bkP6QG)-L(bkPxQx) 31416320Sbholler .int L(bkP7QG)-L(bkPxQx) 31426320Sbholler 31436320Sbholler .int L(bkP0QH)-L(bkPxQx) 31446320Sbholler .int L(bkP1QH)-L(bkPxQx) 31456320Sbholler .int L(bkP2QH)-L(bkPxQx) 31466320Sbholler .int L(bkP3QH)-L(bkPxQx) 31476320Sbholler .int L(bkP4QH)-L(bkPxQx) 31486320Sbholler .int L(bkP5QH)-L(bkPxQx) 31496320Sbholler .int L(bkP6QH)-L(bkPxQx) 31506320Sbholler .int L(bkP7QH)-L(bkPxQx) 31516320Sbholler 31526320Sbholler .int L(bkP0QI)-L(bkPxQx) 31536320Sbholler .int L(bkP1QI)-L(bkPxQx) 31546320Sbholler .int L(bkP2QI)-L(bkPxQx) 31556320Sbholler .int L(bkP3QI)-L(bkPxQx) 31566320Sbholler .int L(bkP4QI)-L(bkPxQx) 31576320Sbholler .int L(bkP5QI)-L(bkPxQx) 31586320Sbholler .int L(bkP6QI)-L(bkPxQx) 31596320Sbholler .int 
L(bkP7QI)-L(bkPxQx) 31606320Sbholler 31610Sstevel@tonic-gate SET_SIZE(memmove) 3162