10Sstevel@tonic-gate/* 26320Sbholler * CDDL HEADER START 36320Sbholler * 46320Sbholler * The contents of this file are subject to the terms of the 56320Sbholler * Common Development and Distribution License (the "License"). 66320Sbholler * You may not use this file except in compliance with the License. 76320Sbholler * 86320Sbholler * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 96320Sbholler * or http://www.opensolaris.org/os/licensing. 106320Sbholler * See the License for the specific language governing permissions 116320Sbholler * and limitations under the License. 126320Sbholler * 136320Sbholler * When distributing Covered Code, include this CDDL HEADER in each 146320Sbholler * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 156320Sbholler * If applicable, add the following below this CDDL HEADER, with the 166320Sbholler * fields enclosed by brackets "[]" replaced with your own identifying 176320Sbholler * information: Portions Copyright [yyyy] [name of copyright owner] 186320Sbholler * 196320Sbholler * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate 220Sstevel@tonic-gate/* 23*10024Sbostrovs * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 246812Sraf * Use is subject to license terms. 256812Sraf */ 266812Sraf 276812Sraf/* 286320Sbholler * Copyright (c) 2008, Intel Corporation 290Sstevel@tonic-gate * All rights reserved. 300Sstevel@tonic-gate */ 310Sstevel@tonic-gate 326320Sbholler/* 336320Sbholler * memcpy.s - copies two blocks of memory 346320Sbholler * Implements memcpy() and memmove() libc primitives. 
356320Sbholler */ 366812Sraf 377298SMark.J.Nelson@Sun.COM .file "memcpy.s" 380Sstevel@tonic-gate 390Sstevel@tonic-gate#include <sys/asm_linkage.h> 406812Sraf 410Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memmove,function) 420Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memcpy,function) 430Sstevel@tonic-gate 440Sstevel@tonic-gate#include "cache.h" 456320Sbholler#include "proc64_id.h" 460Sstevel@tonic-gate 476320Sbholler#define L(s) .memcpy/**/s 486320Sbholler 496320Sbholler/* 506320Sbholler * memcpy algorithm overview: 516320Sbholler * 526320Sbholler * Thresholds used below were determined experimentally. 536320Sbholler * 546320Sbholler * Pseudo code: 556320Sbholler * 56*10024Sbostrovs * NOTE: On AMD NO_SSE is always set. Performance on Opteron did not improve 57*10024Sbostrovs * using 16-byte stores. Setting NO_SSE on AMD should be re-evaluated on 58*10024Sbostrovs * future AMD processors. 59*10024Sbostrovs * 60*10024Sbostrovs * 616320Sbholler * If (size <= 128 bytes) { 626320Sbholler * do unrolled code (primarily 8-byte loads/stores) regardless of 636320Sbholler * alignment. 
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64-bytes/loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128-bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *			    Use 16-byte aligned loads and stores (128 bytes/loop)
 *			} else {
 *				use pairs of xmm registers with SSE2 or SSSE3
 *				instructions to concatenate and shift appropriately
 *				to account for source unalignment. This enables
 *				16-byte aligned loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
 *
 * memmove overview:
 *	memmove is the same as memcpy except one case where copy needs to be
 *	done backwards. The copy backwards code is done in a similar manner.
996320Sbholler */ 1006320Sbholler 1016320Sbholler ENTRY(memmove) 1026320Sbholler cmp %rsi,%rdi # if dst <= src 1036320Sbholler jbe L(CopyForward) # then do copy forward 1046320Sbholler mov %rsi,%r9 # move src to r9 1056320Sbholler add %rdx,%r9 # add len to get addr of end of src 1066320Sbholler cmp %r9,%rdi # if dst < end of src 1076320Sbholler jb L(CopyBackwards) # then do copy backwards 1086320Sbholler jmp L(CopyForward) 1096320Sbholler 1106320Sbholler ENTRY (memcpy) 1116320SbhollerL(CopyForward): 1126320Sbholler mov %rdx,%r8 1136320Sbholler mov %rdi,%rcx 1146320Sbholler mov %rsi,%rdx 1156320Sbholler mov %rdi,%rax 1166320Sbholler lea L(fwdPxQx)(%rip),%r11 1176320Sbholler cmp $0x80,%r8 # 128 1186320Sbholler jg L(ck_use_sse2) 1196320Sbholler add %r8,%rcx 1206320Sbholler add %r8,%rdx 1216320Sbholler 1226320Sbholler movslq (%r11,%r8,4),%r10 1236320Sbholler lea (%r10,%r11,1),%r11 1246320Sbholler jmpq *%r11 1256320Sbholler 1266320Sbholler .balign 16 1276320SbhollerL(ShrtAlignNew): 1286320Sbholler lea L(AliPxQx)(%rip),%r11 1296320Sbholler mov %rcx,%r9 1306320Sbholler and $0xf,%r9 1316320Sbholler 1326320Sbholler movslq (%r11,%r9,4),%r10 1336320Sbholler lea (%r10,%r11,1),%r11 1346320Sbholler jmpq *%r11 1356320Sbholler 1366320Sbholler .balign 16 1376320SbhollerL(fwdPxQx): .int L(P0Q0)-L(fwdPxQx) 1386320Sbholler .int L(P1Q0)-L(fwdPxQx) 1396320Sbholler .int L(P2Q0)-L(fwdPxQx) 1406320Sbholler .int L(P3Q0)-L(fwdPxQx) 1416320Sbholler .int L(P4Q0)-L(fwdPxQx) 1426320Sbholler .int L(P5Q0)-L(fwdPxQx) 1436320Sbholler .int L(P6Q0)-L(fwdPxQx) 1446320Sbholler .int L(P7Q0)-L(fwdPxQx) 1456320Sbholler 1466320Sbholler .int L(P0Q1)-L(fwdPxQx) 1476320Sbholler .int L(P1Q1)-L(fwdPxQx) 1486320Sbholler .int L(P2Q1)-L(fwdPxQx) 1496320Sbholler .int L(P3Q1)-L(fwdPxQx) 1506320Sbholler .int L(P4Q1)-L(fwdPxQx) 1516320Sbholler .int L(P5Q1)-L(fwdPxQx) 1526320Sbholler .int L(P6Q1)-L(fwdPxQx) 1536320Sbholler .int L(P7Q1)-L(fwdPxQx) 1546320Sbholler 1556320Sbholler .int L(P0Q2)-L(fwdPxQx) 1566320Sbholler 
.int L(P1Q2)-L(fwdPxQx) 1576320Sbholler .int L(P2Q2)-L(fwdPxQx) 1586320Sbholler .int L(P3Q2)-L(fwdPxQx) 1596320Sbholler .int L(P4Q2)-L(fwdPxQx) 1606320Sbholler .int L(P5Q2)-L(fwdPxQx) 1616320Sbholler .int L(P6Q2)-L(fwdPxQx) 1626320Sbholler .int L(P7Q2)-L(fwdPxQx) 1636320Sbholler 1646320Sbholler .int L(P0Q3)-L(fwdPxQx) 1656320Sbholler .int L(P1Q3)-L(fwdPxQx) 1666320Sbholler .int L(P2Q3)-L(fwdPxQx) 1676320Sbholler .int L(P3Q3)-L(fwdPxQx) 1686320Sbholler .int L(P4Q3)-L(fwdPxQx) 1696320Sbholler .int L(P5Q3)-L(fwdPxQx) 1706320Sbholler .int L(P6Q3)-L(fwdPxQx) 1716320Sbholler .int L(P7Q3)-L(fwdPxQx) 1726320Sbholler 1736320Sbholler .int L(P0Q4)-L(fwdPxQx) 1746320Sbholler .int L(P1Q4)-L(fwdPxQx) 1756320Sbholler .int L(P2Q4)-L(fwdPxQx) 1766320Sbholler .int L(P3Q4)-L(fwdPxQx) 1776320Sbholler .int L(P4Q4)-L(fwdPxQx) 1786320Sbholler .int L(P5Q4)-L(fwdPxQx) 1796320Sbholler .int L(P6Q4)-L(fwdPxQx) 1806320Sbholler .int L(P7Q4)-L(fwdPxQx) 1816320Sbholler 1826320Sbholler .int L(P0Q5)-L(fwdPxQx) 1836320Sbholler .int L(P1Q5)-L(fwdPxQx) 1846320Sbholler .int L(P2Q5)-L(fwdPxQx) 1856320Sbholler .int L(P3Q5)-L(fwdPxQx) 1866320Sbholler .int L(P4Q5)-L(fwdPxQx) 1876320Sbholler .int L(P5Q5)-L(fwdPxQx) 1886320Sbholler .int L(P6Q5)-L(fwdPxQx) 1896320Sbholler .int L(P7Q5)-L(fwdPxQx) 1906320Sbholler 1916320Sbholler .int L(P0Q6)-L(fwdPxQx) 1926320Sbholler .int L(P1Q6)-L(fwdPxQx) 1936320Sbholler .int L(P2Q6)-L(fwdPxQx) 1946320Sbholler .int L(P3Q6)-L(fwdPxQx) 1956320Sbholler .int L(P4Q6)-L(fwdPxQx) 1966320Sbholler .int L(P5Q6)-L(fwdPxQx) 1976320Sbholler .int L(P6Q6)-L(fwdPxQx) 1986320Sbholler .int L(P7Q6)-L(fwdPxQx) 1996320Sbholler 2006320Sbholler .int L(P0Q7)-L(fwdPxQx) 2016320Sbholler .int L(P1Q7)-L(fwdPxQx) 2026320Sbholler .int L(P2Q7)-L(fwdPxQx) 2036320Sbholler .int L(P3Q7)-L(fwdPxQx) 2046320Sbholler .int L(P4Q7)-L(fwdPxQx) 2056320Sbholler .int L(P5Q7)-L(fwdPxQx) 2066320Sbholler .int L(P6Q7)-L(fwdPxQx) 2076320Sbholler .int L(P7Q7)-L(fwdPxQx) 2086320Sbholler 2096320Sbholler .int L(P0Q8)-L(fwdPxQx) 
2106320Sbholler .int L(P1Q8)-L(fwdPxQx) 2116320Sbholler .int L(P2Q8)-L(fwdPxQx) 2126320Sbholler .int L(P3Q8)-L(fwdPxQx) 2136320Sbholler .int L(P4Q8)-L(fwdPxQx) 2146320Sbholler .int L(P5Q8)-L(fwdPxQx) 2156320Sbholler .int L(P6Q8)-L(fwdPxQx) 2166320Sbholler .int L(P7Q8)-L(fwdPxQx) 2176320Sbholler 2186320Sbholler .int L(P0Q9)-L(fwdPxQx) 2196320Sbholler .int L(P1Q9)-L(fwdPxQx) 2206320Sbholler .int L(P2Q9)-L(fwdPxQx) 2216320Sbholler .int L(P3Q9)-L(fwdPxQx) 2226320Sbholler .int L(P4Q9)-L(fwdPxQx) 2236320Sbholler .int L(P5Q9)-L(fwdPxQx) 2246320Sbholler .int L(P6Q9)-L(fwdPxQx) 2256320Sbholler .int L(P7Q9)-L(fwdPxQx) 2266320Sbholler 2276320Sbholler .int L(P0QA)-L(fwdPxQx) 2286320Sbholler .int L(P1QA)-L(fwdPxQx) 2296320Sbholler .int L(P2QA)-L(fwdPxQx) 2306320Sbholler .int L(P3QA)-L(fwdPxQx) 2316320Sbholler .int L(P4QA)-L(fwdPxQx) 2326320Sbholler .int L(P5QA)-L(fwdPxQx) 2336320Sbholler .int L(P6QA)-L(fwdPxQx) 2346320Sbholler .int L(P7QA)-L(fwdPxQx) 2356320Sbholler 2366320Sbholler .int L(P0QB)-L(fwdPxQx) 2376320Sbholler .int L(P1QB)-L(fwdPxQx) 2386320Sbholler .int L(P2QB)-L(fwdPxQx) 2396320Sbholler .int L(P3QB)-L(fwdPxQx) 2406320Sbholler .int L(P4QB)-L(fwdPxQx) 2416320Sbholler .int L(P5QB)-L(fwdPxQx) 2426320Sbholler .int L(P6QB)-L(fwdPxQx) 2436320Sbholler .int L(P7QB)-L(fwdPxQx) 2446320Sbholler 2456320Sbholler .int L(P0QC)-L(fwdPxQx) 2466320Sbholler .int L(P1QC)-L(fwdPxQx) 2476320Sbholler .int L(P2QC)-L(fwdPxQx) 2486320Sbholler .int L(P3QC)-L(fwdPxQx) 2496320Sbholler .int L(P4QC)-L(fwdPxQx) 2506320Sbholler .int L(P5QC)-L(fwdPxQx) 2516320Sbholler .int L(P6QC)-L(fwdPxQx) 2526320Sbholler .int L(P7QC)-L(fwdPxQx) 2536320Sbholler 2546320Sbholler .int L(P0QD)-L(fwdPxQx) 2556320Sbholler .int L(P1QD)-L(fwdPxQx) 2566320Sbholler .int L(P2QD)-L(fwdPxQx) 2576320Sbholler .int L(P3QD)-L(fwdPxQx) 2586320Sbholler .int L(P4QD)-L(fwdPxQx) 2596320Sbholler .int L(P5QD)-L(fwdPxQx) 2606320Sbholler .int L(P6QD)-L(fwdPxQx) 2616320Sbholler .int L(P7QD)-L(fwdPxQx) 2626320Sbholler 2636320Sbholler .int 
L(P0QE)-L(fwdPxQx) 2646320Sbholler .int L(P1QE)-L(fwdPxQx) 2656320Sbholler .int L(P2QE)-L(fwdPxQx) 2666320Sbholler .int L(P3QE)-L(fwdPxQx) 2676320Sbholler .int L(P4QE)-L(fwdPxQx) 2686320Sbholler .int L(P5QE)-L(fwdPxQx) 2696320Sbholler .int L(P6QE)-L(fwdPxQx) 2706320Sbholler .int L(P7QE)-L(fwdPxQx) 2716320Sbholler 2726320Sbholler .int L(P0QF)-L(fwdPxQx) 2736320Sbholler .int L(P1QF)-L(fwdPxQx) 2746320Sbholler .int L(P2QF)-L(fwdPxQx) 2756320Sbholler .int L(P3QF)-L(fwdPxQx) 2766320Sbholler .int L(P4QF)-L(fwdPxQx) 2776320Sbholler .int L(P5QF)-L(fwdPxQx) 2786320Sbholler .int L(P6QF)-L(fwdPxQx) 2796320Sbholler .int L(P7QF)-L(fwdPxQx) 2806320Sbholler 2816320Sbholler .int L(P0QG)-L(fwdPxQx) # 0x80 2826320Sbholler 2836320Sbholler .balign 16 2846320SbhollerL(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 2856320Sbholler .int L(A1Q0)-L(AliPxQx) 2866320Sbholler .int L(A2Q0)-L(AliPxQx) 2876320Sbholler .int L(A3Q0)-L(AliPxQx) 2886320Sbholler .int L(A4Q0)-L(AliPxQx) 2896320Sbholler .int L(A5Q0)-L(AliPxQx) 2906320Sbholler .int L(A6Q0)-L(AliPxQx) 2916320Sbholler .int L(A7Q0)-L(AliPxQx) 2926320Sbholler .int L(A0Q1)-L(AliPxQx) 2936320Sbholler .int L(A1Q1)-L(AliPxQx) 2946320Sbholler .int L(A2Q1)-L(AliPxQx) 2956320Sbholler .int L(A3Q1)-L(AliPxQx) 2966320Sbholler .int L(A4Q1)-L(AliPxQx) 2976320Sbholler .int L(A5Q1)-L(AliPxQx) 2986320Sbholler .int L(A6Q1)-L(AliPxQx) 2996320Sbholler .int L(A7Q1)-L(AliPxQx) 3006320Sbholler 3016320Sbholler .balign 16 3026320SbhollerL(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 3036320Sbholler movzbq (%rdx),%r11 3046320Sbholler sub $0xf,%r8 3056320Sbholler mov %r11b,(%rcx) 3066320Sbholler 3076320Sbholler movzwq 0x1(%rdx),%r10 3086320Sbholler mov %r10w,0x1(%rcx) 3096320Sbholler 3106320Sbholler mov 0x3(%rdx),%r9d 3116320Sbholler mov %r9d,0x3(%rcx) 3126320Sbholler 3136320Sbholler mov 0x7(%rdx),%r11 3146320Sbholler add $0xf,%rdx 3156320Sbholler mov %r11,0x7(%rcx) 3166320Sbholler 3176320Sbholler add $0xf,%rcx 3186320Sbholler jmp L(now_qw_aligned) 3196320Sbholler 
3206320Sbholler .balign 16 3216320SbhollerL(A2Q0): # ; need to move 8+ 6=2+4 bytes 3226320Sbholler movzwq (%rdx),%r10 3236320Sbholler sub $0xe,%r8 3246320Sbholler mov %r10w,(%rcx) 3256320Sbholler 3266320Sbholler mov 0x2(%rdx),%r9d 3276320Sbholler mov %r9d,0x2(%rcx) 3286320Sbholler 3296320Sbholler mov 0x6(%rdx),%r11 3306320Sbholler add $0xe,%rdx 3316320Sbholler mov %r11,0x6(%rcx) 3326320Sbholler add $0xe,%rcx 3336320Sbholler jmp L(now_qw_aligned) 3346320Sbholler 3356320Sbholler .balign 16 3366320SbhollerL(A3Q0): # ; need to move 8+ 5=1+4 bytes 3376320Sbholler movzbq (%rdx),%r11 3386320Sbholler sub $0xd,%r8 3396320Sbholler mov %r11b,(%rcx) 3406320Sbholler 3416320Sbholler mov 0x1(%rdx),%r9d 3426320Sbholler mov %r9d,0x1(%rcx) 3436320Sbholler 3446320Sbholler mov 0x5(%rdx),%r10 3456320Sbholler add $0xd,%rdx 3466320Sbholler mov %r10,0x5(%rcx) 3476320Sbholler 3486320Sbholler add $0xd,%rcx 3496320Sbholler jmp L(now_qw_aligned) 3506320Sbholler 3516320Sbholler .balign 16 3526320SbhollerL(A4Q0): # ; need to move 8+4 bytes 3536320Sbholler mov (%rdx),%r9d 3546320Sbholler sub $0xc,%r8 3556320Sbholler mov %r9d,(%rcx) 3566320Sbholler 3576320Sbholler mov 0x4(%rdx),%r10 3586320Sbholler add $0xc,%rdx 3596320Sbholler mov %r10,0x4(%rcx) 3606320Sbholler 3616320Sbholler add $0xc,%rcx 3626320Sbholler jmp L(now_qw_aligned) 3636320Sbholler 3646320Sbholler .balign 16 3656320SbhollerL(A5Q0): # ; need to move 8+ 3=1+2 bytes 3666320Sbholler movzbq (%rdx),%r11 3676320Sbholler sub $0xb,%r8 3686320Sbholler mov %r11b,(%rcx) 3696320Sbholler 3706320Sbholler movzwq 0x1(%rdx),%r10 3716320Sbholler mov %r10w,0x1(%rcx) 3726320Sbholler 3736320Sbholler mov 0x3(%rdx),%r9 3746320Sbholler add $0xb,%rdx 3756320Sbholler mov %r9,0x3(%rcx) 3766320Sbholler 3776320Sbholler add $0xb,%rcx 3786320Sbholler jmp L(now_qw_aligned) 3796320Sbholler 3806320Sbholler .balign 16 3816320SbhollerL(A6Q0): # ; need to move 8+2 bytes 3826320Sbholler movzwq (%rdx),%r10 3836320Sbholler sub $0xa,%r8 3846320Sbholler mov %r10w,(%rcx) 
3856320Sbholler 3866320Sbholler mov 0x2(%rdx),%r9 3876320Sbholler add $0xa,%rdx 3886320Sbholler mov %r9,0x2(%rcx) 3896320Sbholler 3906320Sbholler add $0xa,%rcx 3916320Sbholler jmp L(now_qw_aligned) 3926320Sbholler 3936320Sbholler .balign 16 3946320SbhollerL(A7Q0): # ; need to move 8+1 byte 3956320Sbholler movzbq (%rdx),%r11 3966320Sbholler sub $0x9,%r8 3976320Sbholler mov %r11b,(%rcx) 3986320Sbholler 3996320Sbholler mov 0x1(%rdx),%r10 4006320Sbholler add $0x9,%rdx 4016320Sbholler mov %r10,0x1(%rcx) 4026320Sbholler 4036320Sbholler add $0x9,%rcx 4046320Sbholler jmp L(now_qw_aligned) 4056320Sbholler 4066320Sbholler .balign 16 4076320SbhollerL(A0Q1): # ; need to move 8 bytes 4086320Sbholler 4096320Sbholler mov (%rdx),%r10 4106320Sbholler add $0x8,%rdx 4116320Sbholler sub $0x8,%r8 4126320Sbholler mov %r10,(%rcx) 4136320Sbholler 4146320Sbholler add $0x8,%rcx 4156320Sbholler jmp L(now_qw_aligned) 4166320Sbholler 4176320Sbholler .balign 16 4186320SbhollerL(A1Q1): # ; need to move 7=1+2+4 bytes 4196320Sbholler movzbq (%rdx),%r11 4206320Sbholler sub $0x7,%r8 4216320Sbholler mov %r11b,(%rcx) 4226320Sbholler 4236320Sbholler movzwq 0x1(%rdx),%r10 4246320Sbholler mov %r10w,0x1(%rcx) 4256320Sbholler 4266320Sbholler mov 0x3(%rdx),%r9d 4276320Sbholler add $0x7,%rdx 4286320Sbholler mov %r9d,0x3(%rcx) 4296320Sbholler add $0x7,%rcx 4306320Sbholler jmp L(now_qw_aligned) 4316320Sbholler 4326320Sbholler .balign 16 4336320SbhollerL(A2Q1): # ; need to move 6=2+4 bytes 4346320Sbholler movzwq (%rdx),%r10 4356320Sbholler sub $0x6,%r8 4366320Sbholler mov %r10w,(%rcx) 4376320Sbholler mov 0x2(%rdx),%r9d 4386320Sbholler add $0x6,%rdx 4396320Sbholler mov %r9d,0x2(%rcx) 4406320Sbholler add $0x6,%rcx 4416320Sbholler jmp L(now_qw_aligned) 4426320Sbholler 4436320Sbholler .balign 16 4446320SbhollerL(A3Q1): # ; need to move 5=1+4 bytes 4456320Sbholler movzbq (%rdx),%r11 4466320Sbholler sub $0x5,%r8 4476320Sbholler mov %r11b,(%rcx) 4486320Sbholler mov 0x1(%rdx),%r9d 4496320Sbholler add $0x5,%rdx 
4506320Sbholler mov %r9d,0x1(%rcx) 4516320Sbholler add $0x5,%rcx 4526320Sbholler jmp L(now_qw_aligned) 4536320Sbholler 4546320Sbholler .balign 16 4556320SbhollerL(A4Q1): # ; need to move 4 bytes 4566320Sbholler mov (%rdx),%r9d 4576320Sbholler sub $0x4,%r8 4586320Sbholler add $0x4,%rdx 4596320Sbholler mov %r9d,(%rcx) 4606320Sbholler add $0x4,%rcx 4616320Sbholler jmp L(now_qw_aligned) 4626320Sbholler 4636320Sbholler .balign 16 4646320SbhollerL(A5Q1): # ; need to move 3=1+2 bytes 4656320Sbholler movzbq (%rdx),%r11 4666320Sbholler sub $0x3,%r8 4676320Sbholler mov %r11b,(%rcx) 4686320Sbholler 4696320Sbholler movzwq 0x1(%rdx),%r10 4706320Sbholler add $0x3,%rdx 4716320Sbholler mov %r10w,0x1(%rcx) 4726320Sbholler 4736320Sbholler add $0x3,%rcx 4746320Sbholler jmp L(now_qw_aligned) 4756320Sbholler 4766320Sbholler .balign 16 4776320SbhollerL(A6Q1): # ; need to move 2 bytes 4786320Sbholler movzwq (%rdx),%r10 4796320Sbholler sub $0x2,%r8 4806320Sbholler add $0x2,%rdx 4816320Sbholler mov %r10w,(%rcx) 4826320Sbholler add $0x2,%rcx 4836320Sbholler jmp L(now_qw_aligned) 4846320Sbholler 4856320Sbholler .balign 16 4866320SbhollerL(A7Q1): # ; need to move 1 byte 4876320Sbholler movzbq (%rdx),%r11 4886320Sbholler dec %r8 4896320Sbholler inc %rdx 4906320Sbholler mov %r11b,(%rcx) 4916320Sbholler inc %rcx 4926320Sbholler jmp L(now_qw_aligned) 4936320Sbholler 4946320Sbholler 4956320Sbholler .balign 16 4966320SbhollerL(P0QG): 4976320Sbholler mov -0x80(%rdx),%r9 4986320Sbholler mov %r9,-0x80(%rcx) 4996320SbhollerL(P0QF): 5006320Sbholler mov -0x78(%rdx),%r10 5016320Sbholler mov %r10,-0x78(%rcx) 5026320SbhollerL(P0QE): 5036320Sbholler mov -0x70(%rdx),%r9 5046320Sbholler mov %r9,-0x70(%rcx) 5056320SbhollerL(P0QD): 5066320Sbholler mov -0x68(%rdx),%r10 5076320Sbholler mov %r10,-0x68(%rcx) 5086320SbhollerL(P0QC): 5096320Sbholler mov -0x60(%rdx),%r9 5106320Sbholler mov %r9,-0x60(%rcx) 5116320SbhollerL(P0QB): 5126320Sbholler mov -0x58(%rdx),%r10 5136320Sbholler mov %r10,-0x58(%rcx) 
5146320SbhollerL(P0QA): 5156320Sbholler mov -0x50(%rdx),%r9 5166320Sbholler mov %r9,-0x50(%rcx) 5176320SbhollerL(P0Q9): 5186320Sbholler mov -0x48(%rdx),%r10 5196320Sbholler mov %r10,-0x48(%rcx) 5206320SbhollerL(P0Q8): 5216320Sbholler mov -0x40(%rdx),%r9 5226320Sbholler mov %r9,-0x40(%rcx) 5236320SbhollerL(P0Q7): 5246320Sbholler mov -0x38(%rdx),%r10 5256320Sbholler mov %r10,-0x38(%rcx) 5266320SbhollerL(P0Q6): 5276320Sbholler mov -0x30(%rdx),%r9 5286320Sbholler mov %r9,-0x30(%rcx) 5296320SbhollerL(P0Q5): 5306320Sbholler mov -0x28(%rdx),%r10 5316320Sbholler mov %r10,-0x28(%rcx) 5326320SbhollerL(P0Q4): 5336320Sbholler mov -0x20(%rdx),%r9 5346320Sbholler mov %r9,-0x20(%rcx) 5356320SbhollerL(P0Q3): 5366320Sbholler mov -0x18(%rdx),%r10 5376320Sbholler mov %r10,-0x18(%rcx) 5386320SbhollerL(P0Q2): 5396320Sbholler mov -0x10(%rdx),%r9 5406320Sbholler mov %r9,-0x10(%rcx) 5416320SbhollerL(P0Q1): 5426320Sbholler mov -0x8(%rdx),%r10 5436320Sbholler mov %r10,-0x8(%rcx) 5446320SbhollerL(P0Q0): 5456320Sbholler ret 5466320Sbholler 5476320Sbholler .balign 16 5486320SbhollerL(P1QF): 5496320Sbholler mov -0x79(%rdx),%r9 5506320Sbholler mov %r9,-0x79(%rcx) 5516320SbhollerL(P1QE): 5526320Sbholler mov -0x71(%rdx),%r11 5536320Sbholler mov %r11,-0x71(%rcx) 5546320SbhollerL(P1QD): 5556320Sbholler mov -0x69(%rdx),%r10 5566320Sbholler mov %r10,-0x69(%rcx) 5576320SbhollerL(P1QC): 5586320Sbholler mov -0x61(%rdx),%r9 5596320Sbholler mov %r9,-0x61(%rcx) 5606320SbhollerL(P1QB): 5616320Sbholler mov -0x59(%rdx),%r11 5626320Sbholler mov %r11,-0x59(%rcx) 5636320SbhollerL(P1QA): 5646320Sbholler mov -0x51(%rdx),%r10 5656320Sbholler mov %r10,-0x51(%rcx) 5666320SbhollerL(P1Q9): 5676320Sbholler mov -0x49(%rdx),%r9 5686320Sbholler mov %r9,-0x49(%rcx) 5696320SbhollerL(P1Q8): 5706320Sbholler mov -0x41(%rdx),%r11 5716320Sbholler mov %r11,-0x41(%rcx) 5726320SbhollerL(P1Q7): 5736320Sbholler mov -0x39(%rdx),%r10 5746320Sbholler mov %r10,-0x39(%rcx) 5756320SbhollerL(P1Q6): 5766320Sbholler mov -0x31(%rdx),%r9 
5776320Sbholler mov %r9,-0x31(%rcx) 5786320SbhollerL(P1Q5): 5796320Sbholler mov -0x29(%rdx),%r11 5806320Sbholler mov %r11,-0x29(%rcx) 5816320SbhollerL(P1Q4): 5826320Sbholler mov -0x21(%rdx),%r10 5836320Sbholler mov %r10,-0x21(%rcx) 5846320SbhollerL(P1Q3): 5856320Sbholler mov -0x19(%rdx),%r9 5866320Sbholler mov %r9,-0x19(%rcx) 5876320SbhollerL(P1Q2): 5886320Sbholler mov -0x11(%rdx),%r11 5896320Sbholler mov %r11,-0x11(%rcx) 5906320SbhollerL(P1Q1): 5916320Sbholler mov -0x9(%rdx),%r10 5926320Sbholler mov %r10,-0x9(%rcx) 5936320SbhollerL(P1Q0): 5946320Sbholler movzbq -0x1(%rdx),%r9 5956320Sbholler mov %r9b,-0x1(%rcx) 5966320Sbholler ret 5976320Sbholler 5986320Sbholler .balign 16 5996320SbhollerL(P2QF): 6006320Sbholler mov -0x7a(%rdx),%r9 6016320Sbholler mov %r9,-0x7a(%rcx) 6026320SbhollerL(P2QE): 6036320Sbholler mov -0x72(%rdx),%r11 6046320Sbholler mov %r11,-0x72(%rcx) 6056320SbhollerL(P2QD): 6066320Sbholler mov -0x6a(%rdx),%r10 6076320Sbholler mov %r10,-0x6a(%rcx) 6086320SbhollerL(P2QC): 6096320Sbholler mov -0x62(%rdx),%r9 6106320Sbholler mov %r9,-0x62(%rcx) 6116320SbhollerL(P2QB): 6126320Sbholler mov -0x5a(%rdx),%r11 6136320Sbholler mov %r11,-0x5a(%rcx) 6146320SbhollerL(P2QA): 6156320Sbholler mov -0x52(%rdx),%r10 6166320Sbholler mov %r10,-0x52(%rcx) 6176320SbhollerL(P2Q9): 6186320Sbholler mov -0x4a(%rdx),%r9 6196320Sbholler mov %r9,-0x4a(%rcx) 6206320SbhollerL(P2Q8): 6216320Sbholler mov -0x42(%rdx),%r11 6226320Sbholler mov %r11,-0x42(%rcx) 6236320SbhollerL(P2Q7): 6246320Sbholler mov -0x3a(%rdx),%r10 6256320Sbholler mov %r10,-0x3a(%rcx) 6266320SbhollerL(P2Q6): 6276320Sbholler mov -0x32(%rdx),%r9 6286320Sbholler mov %r9,-0x32(%rcx) 6296320SbhollerL(P2Q5): 6306320Sbholler mov -0x2a(%rdx),%r11 6316320Sbholler mov %r11,-0x2a(%rcx) 6326320SbhollerL(P2Q4): 6336320Sbholler mov -0x22(%rdx),%r10 6346320Sbholler mov %r10,-0x22(%rcx) 6356320SbhollerL(P2Q3): 6366320Sbholler mov -0x1a(%rdx),%r9 6376320Sbholler mov %r9,-0x1a(%rcx) 6386320SbhollerL(P2Q2): 6396320Sbholler mov 
-0x12(%rdx),%r11 6406320Sbholler mov %r11,-0x12(%rcx) 6416320SbhollerL(P2Q1): 6426320Sbholler mov -0xa(%rdx),%r10 6436320Sbholler mov %r10,-0xa(%rcx) 6446320SbhollerL(P2Q0): 6456320Sbholler movzwq -0x2(%rdx),%r9 6466320Sbholler mov %r9w,-0x2(%rcx) 6476320Sbholler ret 6486320Sbholler 6496320Sbholler .balign 16 6506320SbhollerL(P3QF): 6516320Sbholler mov -0x7b(%rdx),%r9 6526320Sbholler mov %r9,-0x7b(%rcx) 6536320SbhollerL(P3QE): 6546320Sbholler mov -0x73(%rdx),%r11 6556320Sbholler mov %r11,-0x73(%rcx) 6566320SbhollerL(P3QD): 6576320Sbholler mov -0x6b(%rdx),%r10 6586320Sbholler mov %r10,-0x6b(%rcx) 6596320SbhollerL(P3QC): 6606320Sbholler mov -0x63(%rdx),%r9 6616320Sbholler mov %r9,-0x63(%rcx) 6626320SbhollerL(P3QB): 6636320Sbholler mov -0x5b(%rdx),%r11 6646320Sbholler mov %r11,-0x5b(%rcx) 6656320SbhollerL(P3QA): 6666320Sbholler mov -0x53(%rdx),%r10 6676320Sbholler mov %r10,-0x53(%rcx) 6686320SbhollerL(P3Q9): 6696320Sbholler mov -0x4b(%rdx),%r9 6706320Sbholler mov %r9,-0x4b(%rcx) 6716320SbhollerL(P3Q8): 6726320Sbholler mov -0x43(%rdx),%r11 6736320Sbholler mov %r11,-0x43(%rcx) 6746320SbhollerL(P3Q7): 6756320Sbholler mov -0x3b(%rdx),%r10 6766320Sbholler mov %r10,-0x3b(%rcx) 6776320SbhollerL(P3Q6): 6786320Sbholler mov -0x33(%rdx),%r9 6796320Sbholler mov %r9,-0x33(%rcx) 6806320SbhollerL(P3Q5): 6816320Sbholler mov -0x2b(%rdx),%r11 6826320Sbholler mov %r11,-0x2b(%rcx) 6836320SbhollerL(P3Q4): 6846320Sbholler mov -0x23(%rdx),%r10 6856320Sbholler mov %r10,-0x23(%rcx) 6866320SbhollerL(P3Q3): 6876320Sbholler mov -0x1b(%rdx),%r9 6886320Sbholler mov %r9,-0x1b(%rcx) 6896320SbhollerL(P3Q2): 6906320Sbholler mov -0x13(%rdx),%r11 6916320Sbholler mov %r11,-0x13(%rcx) 6926320SbhollerL(P3Q1): 6936320Sbholler mov -0xb(%rdx),%r10 6946320Sbholler mov %r10,-0xb(%rcx) 6956320Sbholler /* 6966320Sbholler * These trailing loads/stores have to do all their loads 1st, 6976320Sbholler * then do the stores. 
6986320Sbholler */ 6996320SbhollerL(P3Q0): 7006320Sbholler movzwq -0x3(%rdx),%r9 7016320Sbholler movzbq -0x1(%rdx),%r10 7026320Sbholler mov %r9w,-0x3(%rcx) 7036320Sbholler mov %r10b,-0x1(%rcx) 7046320Sbholler ret 7056320Sbholler 7066320Sbholler .balign 16 7076320SbhollerL(P4QF): 7086320Sbholler mov -0x7c(%rdx),%r9 7096320Sbholler mov %r9,-0x7c(%rcx) 7106320SbhollerL(P4QE): 7116320Sbholler mov -0x74(%rdx),%r11 7126320Sbholler mov %r11,-0x74(%rcx) 7136320SbhollerL(P4QD): 7146320Sbholler mov -0x6c(%rdx),%r10 7156320Sbholler mov %r10,-0x6c(%rcx) 7166320SbhollerL(P4QC): 7176320Sbholler mov -0x64(%rdx),%r9 7186320Sbholler mov %r9,-0x64(%rcx) 7196320SbhollerL(P4QB): 7206320Sbholler mov -0x5c(%rdx),%r11 7216320Sbholler mov %r11,-0x5c(%rcx) 7226320SbhollerL(P4QA): 7236320Sbholler mov -0x54(%rdx),%r10 7246320Sbholler mov %r10,-0x54(%rcx) 7256320SbhollerL(P4Q9): 7266320Sbholler mov -0x4c(%rdx),%r9 7276320Sbholler mov %r9,-0x4c(%rcx) 7286320SbhollerL(P4Q8): 7296320Sbholler mov -0x44(%rdx),%r11 7306320Sbholler mov %r11,-0x44(%rcx) 7316320SbhollerL(P4Q7): 7326320Sbholler mov -0x3c(%rdx),%r10 7336320Sbholler mov %r10,-0x3c(%rcx) 7346320SbhollerL(P4Q6): 7356320Sbholler mov -0x34(%rdx),%r9 7366320Sbholler mov %r9,-0x34(%rcx) 7376320SbhollerL(P4Q5): 7386320Sbholler mov -0x2c(%rdx),%r11 7396320Sbholler mov %r11,-0x2c(%rcx) 7406320SbhollerL(P4Q4): 7416320Sbholler mov -0x24(%rdx),%r10 7426320Sbholler mov %r10,-0x24(%rcx) 7436320SbhollerL(P4Q3): 7446320Sbholler mov -0x1c(%rdx),%r9 7456320Sbholler mov %r9,-0x1c(%rcx) 7466320SbhollerL(P4Q2): 7476320Sbholler mov -0x14(%rdx),%r11 7486320Sbholler mov %r11,-0x14(%rcx) 7496320SbhollerL(P4Q1): 7506320Sbholler mov -0xc(%rdx),%r10 7516320Sbholler mov %r10,-0xc(%rcx) 7526320SbhollerL(P4Q0): 7536320Sbholler mov -0x4(%rdx),%r9d 7546320Sbholler mov %r9d,-0x4(%rcx) 7556320Sbholler ret 7566320Sbholler 7576320Sbholler .balign 16 7586320SbhollerL(P5QF): 7596320Sbholler mov -0x7d(%rdx),%r9 7606320Sbholler mov %r9,-0x7d(%rcx) 7616320SbhollerL(P5QE): 
7626320Sbholler mov -0x75(%rdx),%r11 7636320Sbholler mov %r11,-0x75(%rcx) 7646320SbhollerL(P5QD): 7656320Sbholler mov -0x6d(%rdx),%r10 7666320Sbholler mov %r10,-0x6d(%rcx) 7676320SbhollerL(P5QC): 7686320Sbholler mov -0x65(%rdx),%r9 7696320Sbholler mov %r9,-0x65(%rcx) 7706320SbhollerL(P5QB): 7716320Sbholler mov -0x5d(%rdx),%r11 7726320Sbholler mov %r11,-0x5d(%rcx) 7736320SbhollerL(P5QA): 7746320Sbholler mov -0x55(%rdx),%r10 7756320Sbholler mov %r10,-0x55(%rcx) 7766320SbhollerL(P5Q9): 7776320Sbholler mov -0x4d(%rdx),%r9 7786320Sbholler mov %r9,-0x4d(%rcx) 7796320SbhollerL(P5Q8): 7806320Sbholler mov -0x45(%rdx),%r11 7816320Sbholler mov %r11,-0x45(%rcx) 7826320SbhollerL(P5Q7): 7836320Sbholler mov -0x3d(%rdx),%r10 7846320Sbholler mov %r10,-0x3d(%rcx) 7856320SbhollerL(P5Q6): 7866320Sbholler mov -0x35(%rdx),%r9 7876320Sbholler mov %r9,-0x35(%rcx) 7886320SbhollerL(P5Q5): 7896320Sbholler mov -0x2d(%rdx),%r11 7906320Sbholler mov %r11,-0x2d(%rcx) 7916320SbhollerL(P5Q4): 7926320Sbholler mov -0x25(%rdx),%r10 7936320Sbholler mov %r10,-0x25(%rcx) 7946320SbhollerL(P5Q3): 7956320Sbholler mov -0x1d(%rdx),%r9 7966320Sbholler mov %r9,-0x1d(%rcx) 7976320SbhollerL(P5Q2): 7986320Sbholler mov -0x15(%rdx),%r11 7996320Sbholler mov %r11,-0x15(%rcx) 8006320SbhollerL(P5Q1): 8016320Sbholler mov -0xd(%rdx),%r10 8026320Sbholler mov %r10,-0xd(%rcx) 8036320Sbholler /* 8046320Sbholler * These trailing loads/stores have to do all their loads 1st, 8056320Sbholler * then do the stores. 
8066320Sbholler */ 8076320SbhollerL(P5Q0): 8086320Sbholler mov -0x5(%rdx),%r9d 8096320Sbholler movzbq -0x1(%rdx),%r10 8106320Sbholler mov %r9d,-0x5(%rcx) 8116320Sbholler mov %r10b,-0x1(%rcx) 8126320Sbholler ret 8136320Sbholler 8146320Sbholler .balign 16 8156320SbhollerL(P6QF): 8166320Sbholler mov -0x7e(%rdx),%r9 8176320Sbholler mov %r9,-0x7e(%rcx) 8186320SbhollerL(P6QE): 8196320Sbholler mov -0x76(%rdx),%r11 8206320Sbholler mov %r11,-0x76(%rcx) 8216320SbhollerL(P6QD): 8226320Sbholler mov -0x6e(%rdx),%r10 8236320Sbholler mov %r10,-0x6e(%rcx) 8246320SbhollerL(P6QC): 8256320Sbholler mov -0x66(%rdx),%r9 8266320Sbholler mov %r9,-0x66(%rcx) 8276320SbhollerL(P6QB): 8286320Sbholler mov -0x5e(%rdx),%r11 8296320Sbholler mov %r11,-0x5e(%rcx) 8306320SbhollerL(P6QA): 8316320Sbholler mov -0x56(%rdx),%r10 8326320Sbholler mov %r10,-0x56(%rcx) 8336320SbhollerL(P6Q9): 8346320Sbholler mov -0x4e(%rdx),%r9 8356320Sbholler mov %r9,-0x4e(%rcx) 8366320SbhollerL(P6Q8): 8376320Sbholler mov -0x46(%rdx),%r11 8386320Sbholler mov %r11,-0x46(%rcx) 8396320SbhollerL(P6Q7): 8406320Sbholler mov -0x3e(%rdx),%r10 8416320Sbholler mov %r10,-0x3e(%rcx) 8426320SbhollerL(P6Q6): 8436320Sbholler mov -0x36(%rdx),%r9 8446320Sbholler mov %r9,-0x36(%rcx) 8456320SbhollerL(P6Q5): 8466320Sbholler mov -0x2e(%rdx),%r11 8476320Sbholler mov %r11,-0x2e(%rcx) 8486320SbhollerL(P6Q4): 8496320Sbholler mov -0x26(%rdx),%r10 8506320Sbholler mov %r10,-0x26(%rcx) 8516320SbhollerL(P6Q3): 8526320Sbholler mov -0x1e(%rdx),%r9 8536320Sbholler mov %r9,-0x1e(%rcx) 8546320SbhollerL(P6Q2): 8556320Sbholler mov -0x16(%rdx),%r11 8566320Sbholler mov %r11,-0x16(%rcx) 8576320SbhollerL(P6Q1): 8586320Sbholler mov -0xe(%rdx),%r10 8596320Sbholler mov %r10,-0xe(%rcx) 8606320Sbholler /* 8616320Sbholler * These trailing loads/stores have to do all their loads 1st, 8626320Sbholler * then do the stores. 
 */
L(P6Q0):
	mov    -0x6(%rdx),%r9d		# final 6 bytes = dword + word; do all
	movzwq -0x2(%rdx),%r10		# loads before any store (regions may
	mov    %r9d,-0x6(%rcx)		# overlap for memmove)
	mov    %r10w,-0x2(%rcx)
	ret

	.balign 16
/*
 * Tail copy for a remainder of (7 + 8*k) bytes, k = 15..0.  Entry L(P7Qk)
 * copies the quadword at offset -(7 + 8*k) and falls through to the next
 * smaller case; L(P7Q0) finishes the last 7 bytes.  %rdx (src) and %rcx
 * (dst) have already been advanced past the end of the region, hence the
 * negative offsets.  Alternating %r9/%r11/%r10 avoids back-to-back
 * dependences on a single scratch register.
 */
L(P7QF):
	mov    -0x7f(%rdx),%r9
	mov    %r9,-0x7f(%rcx)
L(P7QE):
	mov    -0x77(%rdx),%r11
	mov    %r11,-0x77(%rcx)
L(P7QD):
	mov    -0x6f(%rdx),%r10
	mov    %r10,-0x6f(%rcx)
L(P7QC):
	mov    -0x67(%rdx),%r9
	mov    %r9,-0x67(%rcx)
L(P7QB):
	mov    -0x5f(%rdx),%r11
	mov    %r11,-0x5f(%rcx)
L(P7QA):
	mov    -0x57(%rdx),%r10
	mov    %r10,-0x57(%rcx)
L(P7Q9):
	mov    -0x4f(%rdx),%r9
	mov    %r9,-0x4f(%rcx)
L(P7Q8):
	mov    -0x47(%rdx),%r11
	mov    %r11,-0x47(%rcx)
L(P7Q7):
	mov    -0x3f(%rdx),%r10
	mov    %r10,-0x3f(%rcx)
L(P7Q6):
	mov    -0x37(%rdx),%r9
	mov    %r9,-0x37(%rcx)
L(P7Q5):
	mov    -0x2f(%rdx),%r11
	mov    %r11,-0x2f(%rcx)
L(P7Q4):
	mov    -0x27(%rdx),%r10
	mov    %r10,-0x27(%rcx)
L(P7Q3):
	mov    -0x1f(%rdx),%r9
	mov    %r9,-0x1f(%rcx)
L(P7Q2):
	mov    -0x17(%rdx),%r11
	mov    %r11,-0x17(%rcx)
L(P7Q1):
	mov    -0xf(%rdx),%r10
	mov    %r10,-0xf(%rcx)
	/*
	 * These trailing loads/stores have to do all their loads 1st,
	 * then do the stores (the final 7 bytes of src may overlap the
	 * bytes just stored into dst).
	 */
L(P7Q0):
	mov    -0x7(%rdx),%r9d		# last 7 bytes = dword + word + byte
	movzwq -0x3(%rdx),%r10
	movzbq -0x1(%rdx),%r11
	mov    %r9d,-0x7(%rcx)
	mov    %r10w,-0x3(%rcx)
	mov    %r11b,-0x1(%rcx)
	ret

	.balign 16
L(ck_use_sse2):
	/*
	 * Align dest to 16 byte boundary.
	 */
	test   $0xf,%rcx		# dst already 16-byte aligned?
	jnz    L(ShrtAlignNew)		# no - go fix up dst alignment first

L(now_qw_aligned):
	/*
	 * dst is now 16-byte aligned.  Pick a copy strategy based on the
	 * method selected at startup (.memops_method) and the copy size
	 * (%r8 = remaining byte count).
	 */
	cmpl   $NO_SSE,.memops_method(%rip)
	je     L(Loop8byte_pre)		# SSE disabled - 8-byte integer loop

	/*
	 * The fall-through path is to do SSE2 16-byte load/stores
	 */

	/*
	 * If current move size is larger than half of the highest level cache
	 * size, then do non-temporal moves (the data won't be re-read soon,
	 * so don't pollute the cache with it).
	 */
	mov    .largest_level_cache_size(%rip),%r9d
	shr    %r9			# take half of it (upper 32 bits of
					# %r9 are zero from the 32-bit load)
	cmp    %r9,%r8
	jg     L(sse2_nt_move)		# size > cache/2: non-temporal path

	/*
	 * If both the source and dest are aligned, then use the both aligned
	 * logic. Well aligned data should reap the rewards.
	 */
	test   $0xf,%rdx
	jz     L(pre_both_aligned)

	lea    L(SSE_src)(%rip),%r10	# SSE2 dispatch table (default)
	testl  $USE_SSSE3,.memops_method(%rip)
	jz     1f
	lea    L(SSSE3_src)(%rip),%r10	# SSSE3 dispatch table

1:
	/*
	 * src is not 16-byte aligned.  Copy one (unaligned) 16-byte chunk,
	 * then round src DOWN to a 16-byte boundary; %r11 = src & 0xf is the
	 * misalignment and selects the shift/merge loop below.  %xmm1 carries
	 * the first aligned 16 bytes into that loop.
	 */
	mov    %rdx,%r11
	and    $0xf,%r11		# %r11 = src misalignment (1..15 here)
	movdqu (%rdx),%xmm0		# unaligned load of first 16 bytes
	movdqa %xmm0,(%rcx)		# dst is aligned - aligned store ok
	add    $0x10,%rdx
	sub    %r11,%rdx		# src now rounded down to 16-byte bound
	add    $0x10,%rcx
	sub    $0x10,%r8
	movdqa (%rdx),%xmm1		# prime %xmm1 for the merge loops

	movslq (%r10,%r11,4),%r9	# fetch table entry for alignment %r11
	lea    (%r9,%r10,1),%r10	# entries are offsets from table base
	jmpq   *%r10

	/*
	 * Dispatch tables: entry k is the 32-bit offset (relative to the
	 * table base) of the copy loop handling src alignment k.  Entry 8 of
	 * the SSSE3 table reuses the SSE2 shufpd loop L(movdqa8), since an
	 * 8-byte shuffle needs no palignr.
	 */
	.balign 16
L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
	.int	L(mov3dqa1)-L(SSSE3_src)
	.int	L(mov3dqa2)-L(SSSE3_src)
	.int	L(mov3dqa3)-L(SSSE3_src)
	.int	L(mov3dqa4)-L(SSSE3_src)
	.int	L(mov3dqa5)-L(SSSE3_src)
	.int	L(mov3dqa6)-L(SSSE3_src)
	.int	L(mov3dqa7)-L(SSSE3_src)
	.int	L(movdqa8)-L(SSSE3_src)
	.int	L(mov3dqa9)-L(SSSE3_src)
	.int	L(mov3dqa10)-L(SSSE3_src)
	.int	L(mov3dqa11)-L(SSSE3_src)
	.int	L(mov3dqa12)-L(SSSE3_src)
	.int	L(mov3dqa13)-L(SSSE3_src)
	.int	L(mov3dqa14)-L(SSSE3_src)
	.int	L(mov3dqa15)-L(SSSE3_src)
L(SSE_src): .int	L(pre_both_aligned)-L(SSE_src)
	.int	L(movdqa1)-L(SSE_src)
	.int	L(movdqa2)-L(SSE_src)
	.int	L(movdqa3)-L(SSE_src)
	.int	L(movdqa4)-L(SSE_src)
	.int	L(movdqa5)-L(SSE_src)
	.int	L(movdqa6)-L(SSE_src)
	.int	L(movdqa7)-L(SSE_src)
	.int	L(movdqa8)-L(SSE_src)
	.int	L(movdqa9)-L(SSE_src)
	.int	L(movdqa10)-L(SSE_src)
	.int	L(movdqa11)-L(SSE_src)
	.int	L(movdqa12)-L(SSE_src)
	.int	L(movdqa13)-L(SSE_src)
	.int	L(movdqa14)-L(SSE_src)
	.int	L(movdqa15)-L(SSE_src)

	/*
	 * SSE2 copy loops for misaligned src, dst 16-byte aligned.
	 * L(movdqaN) handles src alignment N: each iteration loads two
	 * aligned 16-byte blocks and merges them with the block carried in
	 * %xmm1 from the previous iteration, using psrldq $N (shift the old
	 * block right) + pslldq $(16-N) (shift the new block left) + por.
	 * Loop state: %rdx = aligned src, %rcx = dst, %r8 = bytes remaining,
	 * %xmm1 = carry block.  All loops fall out to L(movdqa_epi).
	 */
	.balign 16
L(movdqa1):
	movdqa 0x10(%rdx),%xmm3	# load the next two aligned src blocks
	movdqa 0x20(%rdx),%xmm0
	lea    0x20(%rdx),%rdx
	lea    -0x20(%r8),%r8	# lea, not sub: arithmetic without flags

	psrldq $0x1,%xmm1	# shift right prev block (saved last iter)
	movdqa %xmm3,%xmm2	# save block for use next merge
	pslldq $0xf,%xmm3	# shift current block left (shift in zeros)
	por    %xmm1,%xmm3	# combine the two pieces
	cmp    $0x20,%r8	# set flags early; SSE ops below leave them

	psrldq $0x1,%xmm2
	movdqa %xmm0,%xmm1	# becomes the carry for next iteration
	pslldq $0xf,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)	# store both merged blocks
	movdqa %xmm0,0x10(%rcx)
	lea    0x20(%rcx),%rcx	# lea preserves the cmp flags for jge

	jge    L(movdqa1)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa2):			# src alignment 2: shift 2/14, same scheme
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x2,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xe,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x2,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xe,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa2)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa3):			# src alignment 3: shift 3/13
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x3,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xd,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x3,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xd,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa3)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa4):			# src alignment 4: shift 4/12
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x4,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xc,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x4,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xc,%xmm0
	por    %xmm2,%xmm0

	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa4)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa5):			# src alignment 5: shift 5/11
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x5,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xb,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x5,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xb,%xmm0
	por    %xmm2,%xmm0

	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa5)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa6):			# src alignment 6: shift 6/10
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x6,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xa,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x6,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xa,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa6)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa7):			# src alignment 7: shift 7/9
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x7,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x9,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x7,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x9,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa7)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa8):
	/*
	 * src alignment 8: a single shufpd glues the high qword of the old
	 * block to the low qword of the new one, so three blocks (0x30
	 * bytes) are merged per iteration instead of two.
	 */
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx

	shufpd $0x1,%xmm3,%xmm1	# xmm1 = {xmm1.hi, xmm3.lo}
	movdqa %xmm1,(%rcx)

	cmp    $0x30,%r8	# flags survive the SSE ops and leas below

	shufpd $0x1,%xmm0,%xmm3
	movdqa %xmm3,0x10(%rcx)

	movdqa %xmm5,%xmm1	# carry for next iteration
	shufpd $0x1,%xmm5,%xmm0
	movdqa %xmm0,0x20(%rcx)

	lea    0x30(%rcx),%rcx

	jge    L(movdqa8)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa9):			# src alignment 9: shift 9/7
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x9,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x7,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x9,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x7,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa9)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa10):			# src alignment 10: shift 10/6
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xa,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x6,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xa,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x6,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa10)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa11):			# src alignment 11: shift 11/5
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xb,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x5,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xb,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x5,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa11)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa12):			# src alignment 12: shift 12/4
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xc,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x4,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xc,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x4,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa12)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa13):			# src alignment 13: shift 13/3
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xd,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x3,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xd,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x3,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa13)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa14):			# src alignment 14: shift 14/2
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xe,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x2,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xe,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x2,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa14)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa15):			# src alignment 15: shift 15/1
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0xf,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x1,%xmm3
	por    %xmm1,%xmm3

	psrldq $0xf,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x1,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa15)
	#jmp   L(movdqa_epi)	# elided: falls straight through to the epi

	.balign 16
L(movdqa_epi):
	/*
	 * Common epilogue for all the merge loops: fewer than 0x20 (or 0x30
	 * for the 3-block loops) bytes remain.  Re-bias src back to its true
	 * (unaligned) position and dispatch through L(fwdPxQx) - a table
	 * defined earlier in this file - indexed by the remaining count %r8.
	 */
	lea    L(fwdPxQx)(%rip),%r10
	add    %r11,%rdx	# bump rdx to the right addr (it lagged
				# behind by the misalignment in the loops)
	add    %r8,%rcx		# point one past the end; the PxQx targets
	add    %r8,%rdx		# use negative offsets
	movslq (%r10,%r8,4),%r9	# entries are offsets from the table base
	lea    (%r9,%r10,1),%r10
	jmpq   *%r10

	/*
	 * SSSE3 copy loops: like L(movdqaN) but palignr concatenates the
	 * carried block with the new one in a single instruction, letting
	 * each iteration merge three 16-byte blocks (0x30 bytes).
	 * The palignr opcodes are hand-encoded with .byte - presumably the
	 * build-time assembler predates SSSE3 support (TODO: confirm); the
	 * commented #palignr lines show the intended instruction.
	 */
	.balign 16
L(mov3dqa1):			# src alignment 1
	movdqa 0x10(%rdx),%xmm3	# load next three aligned src blocks
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8	# flags survive until the jge below

	movdqa %xmm3,%xmm2	# save block for the next concat
	#palignr $0x1,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x01
	movdqa %xmm3,(%rcx)	# store merged block

	movdqa %xmm0,%xmm4
	#palignr $0x1,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x01
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1	# carry block for next iteration
	#palignr $0x1,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x01
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa1)

	/* drain 0, 1 or 2 leftover 16-byte blocks before the epilogue */
	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x1,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x01

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store merged block
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# one more 16-byte block remains
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x1,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x01
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	/*
	 * L(mov3dqaN), N = 2..7 and 9..15, are identical to L(mov3dqa1)
	 * except that palignr's immediate (the src misalignment N) changes;
	 * it appears as the last .byte of each hand-encoded instruction.
	 * There is no L(mov3dqa8): alignment 8 uses the shufpd loop
	 * L(movdqa8) instead (see the L(SSSE3_src) dispatch table).
	 * Per-loop state: %rdx = aligned src, %rcx = dst, %r8 = bytes left,
	 * %xmm1 = 16-byte carry block from the previous iteration.
	 */
	.balign 16
L(mov3dqa2):			# src alignment 2
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x2,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x02
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x2,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x02
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x2,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x02
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa2)

	cmp    $0x10,%r8	# drain 0-2 leftover 16-byte blocks
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x2,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x02

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0	# last remaining block
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x2,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x02
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa3):			# src alignment 3
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x3,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x03
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x3,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x03
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x3,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x03
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa3)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x3,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x03

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x3,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x03
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa4):			# src alignment 4
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x4,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x04
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x4,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x04
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x4,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x04
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa4)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x4,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x04

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x4,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x04
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa5):			# src alignment 5
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x5,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x05
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x5,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x05
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x5,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x05
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa5)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x5,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x05

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x5,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x05
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa6):			# src alignment 6
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x6,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x06
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x6,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x06
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x6,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x06
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa6)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x6,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x06

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x6,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x06
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa7):			# src alignment 7
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x7,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x07
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x7,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x07
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x7,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x07
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa7)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x7,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x07

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x7,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x07
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa9):			# src alignment 9 (8 is handled by movdqa8)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0x9,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x09
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0x9,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x09
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0x9,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x09
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa9)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0x9,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x09

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0x9,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x09
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa10):			# src alignment 10
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0xa,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0a
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0xa,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0a
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0xa,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x0a
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa10)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0xa,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0a

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0xa,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0a
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa11):			# src alignment 11
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0xb,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0b
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0xb,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0b
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0xb,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x0b
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa11)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0xb,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0b

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0xb,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0b
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa12):			# src alignment 12
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0xc,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0c
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0xc,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0c
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0xc,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x0c
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa12)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0xc,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0c

	cmp    $0x10,%r8
	movdqa %xmm3,(%rcx)	# store it
	lea    0x10(%rcx),%rcx
	jl     L(movdqa_epi)

	movdqa 0x10(%rdx),%xmm0
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	#palignr $0xc,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0c
	movdqa %xmm0,(%rcx)
	lea    0x10(%rcx),%rcx
	jmp    L(movdqa_epi)

	.balign 16
L(mov3dqa13):			# src alignment 13 (tail continues below)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx
	cmp    $0x30,%r8

	movdqa %xmm3,%xmm2
	#palignr $0xd,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0d
	movdqa %xmm3,(%rcx)

	movdqa %xmm0,%xmm4
	#palignr $0xd,%xmm2,%xmm0
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xc2,0x0d
	movdqa %xmm0,0x10(%rcx)

	movdqa %xmm5,%xmm1
	#palignr $0xd,%xmm4,%xmm5
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xec,0x0d
	movdqa %xmm5,0x20(%rcx)

	lea    0x30(%rcx),%rcx
	jge    L(mov3dqa13)

	cmp    $0x10,%r8
	jl     L(movdqa_epi)
	movdqa 0x10(%rdx),%xmm3
	sub    $0x10,%r8
	lea    0x10(%rdx),%rdx
	movdqa %xmm3,%xmm2	# save for use next concat
	#palignr $0xd,%xmm1,%xmm3
	.byte  0x66,0x0f,0x3a,0x0f
	.byte  0xd9,0x0d

	cmp    $0x10,%r8
20396320Sbholler movdqa %xmm3,(%rcx) # store it 20406320Sbholler lea 0x10(%rcx),%rcx 20416320Sbholler jl L(movdqa_epi) 20426320Sbholler 20436320Sbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 20446320Sbholler sub $0x10,%r8 20456320Sbholler lea 0x10(%rdx),%rdx 20466320Sbholler #palignr $0xd,%xmm2,%xmm0 20476320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20486320Sbholler .byte 0xc2,0x0d 20496320Sbholler movdqa %xmm0,(%rcx) # store it 20506320Sbholler lea 0x10(%rcx),%rcx 20516320Sbholler jmp L(movdqa_epi) 20526320Sbholler 20536320Sbholler .balign 16 20546320SbhollerL(mov3dqa14): 20556320Sbholler movdqa 0x10(%rdx),%xmm3 20566320Sbholler sub $0x30,%r8 20576320Sbholler movdqa 0x20(%rdx),%xmm0 20586320Sbholler movdqa 0x30(%rdx),%xmm5 20596320Sbholler lea 0x30(%rdx),%rdx 20606320Sbholler cmp $0x30,%r8 20616320Sbholler 20626320Sbholler movdqa %xmm3,%xmm2 20636320Sbholler #palignr $0xe,%xmm1,%xmm3 20646320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20656320Sbholler .byte 0xd9,0x0e 20666320Sbholler movdqa %xmm3,(%rcx) 20676320Sbholler 20686320Sbholler movdqa %xmm0,%xmm4 20696320Sbholler #palignr $0xe,%xmm2,%xmm0 20706320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20716320Sbholler .byte 0xc2,0x0e 20726320Sbholler movdqa %xmm0,0x10(%rcx) 20736320Sbholler 20746320Sbholler movdqa %xmm5,%xmm1 20756320Sbholler #palignr $0xe,%xmm4,%xmm5 20766320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20776320Sbholler .byte 0xec,0x0e 20786320Sbholler movdqa %xmm5,0x20(%rcx) 20796320Sbholler 20806320Sbholler lea 0x30(%rcx),%rcx 20816320Sbholler jge L(mov3dqa14) 20826320Sbholler 20836320Sbholler cmp $0x10,%r8 20846320Sbholler jl L(movdqa_epi) 20856320Sbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 20866320Sbholler sub $0x10,%r8 20876320Sbholler lea 0x10(%rdx),%rdx 20886320Sbholler movdqa %xmm3,%xmm2 # save for use next concat 20896320Sbholler #palignr $0xe,%xmm1,%xmm3 20906320Sbholler .byte 0x66,0x0f,0x3a,0x0f 20916320Sbholler .byte 0xd9,0x0e 20926320Sbholler 20936320Sbholler cmp $0x10,%r8 
20946320Sbholler movdqa %xmm3,(%rcx) # store it 20956320Sbholler lea 0x10(%rcx),%rcx 20966320Sbholler jl L(movdqa_epi) 20976320Sbholler 20986320Sbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 20996320Sbholler sub $0x10,%r8 21006320Sbholler lea 0x10(%rdx),%rdx 21016320Sbholler #palignr $0xe,%xmm2,%xmm0 21026320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21036320Sbholler .byte 0xc2,0x0e 21046320Sbholler movdqa %xmm0,(%rcx) # store it 21056320Sbholler lea 0x10(%rcx),%rcx 21066320Sbholler jmp L(movdqa_epi) 21076320Sbholler 21086320Sbholler .balign 16 21096320SbhollerL(mov3dqa15): 21106320Sbholler movdqa 0x10(%rdx),%xmm3 21116320Sbholler sub $0x30,%r8 21126320Sbholler movdqa 0x20(%rdx),%xmm0 21136320Sbholler movdqa 0x30(%rdx),%xmm5 21146320Sbholler lea 0x30(%rdx),%rdx 21156320Sbholler cmp $0x30,%r8 21166320Sbholler 21176320Sbholler movdqa %xmm3,%xmm2 21186320Sbholler #palignr $0xf,%xmm1,%xmm3 21196320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21206320Sbholler .byte 0xd9,0x0f 21216320Sbholler movdqa %xmm3,(%rcx) 21226320Sbholler 21236320Sbholler movdqa %xmm0,%xmm4 21246320Sbholler #palignr $0xf,%xmm2,%xmm0 21256320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21266320Sbholler .byte 0xc2,0x0f 21276320Sbholler movdqa %xmm0,0x10(%rcx) 21286320Sbholler 21296320Sbholler movdqa %xmm5,%xmm1 21306320Sbholler #palignr $0xf,%xmm4,%xmm5 21316320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21326320Sbholler .byte 0xec,0x0f 21336320Sbholler movdqa %xmm5,0x20(%rcx) 21346320Sbholler 21356320Sbholler lea 0x30(%rcx),%rcx 21366320Sbholler jge L(mov3dqa15) 21376320Sbholler 21386320Sbholler cmp $0x10,%r8 21396320Sbholler jl L(movdqa_epi) 21406320Sbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 21416320Sbholler sub $0x10,%r8 21426320Sbholler lea 0x10(%rdx),%rdx 21436320Sbholler movdqa %xmm3,%xmm2 # save for use next concat 21446320Sbholler #palignr $0xf,%xmm1,%xmm3 21456320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21466320Sbholler .byte 0xd9,0x0f 21476320Sbholler 21486320Sbholler cmp $0x10,%r8 
21496320Sbholler movdqa %xmm3,(%rcx) # store it 21506320Sbholler lea 0x10(%rcx),%rcx 21516320Sbholler jl L(movdqa_epi) 21526320Sbholler 21536320Sbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 21546320Sbholler sub $0x10,%r8 21556320Sbholler lea 0x10(%rdx),%rdx 21566320Sbholler #palignr $0xf,%xmm2,%xmm0 21576320Sbholler .byte 0x66,0x0f,0x3a,0x0f 21586320Sbholler .byte 0xc2,0x0f 21596320Sbholler movdqa %xmm0,(%rcx) # store it 21606320Sbholler lea 0x10(%rcx),%rcx 21616320Sbholler jmp L(movdqa_epi) 21626320Sbholler 21636320Sbholler .balign 16 21646320SbhollerL(sse2_nt_move): 21656320Sbholler lea 0x40(%rcx),%rcx 21666320Sbholler lea 0x40(%rdx),%rdx 21676320Sbholler lea -0x40(%r8),%r8 21686320Sbholler 21696320Sbholler /* 21706320Sbholler * doesn't matter if source is aligned for stuff out of cache. 21716320Sbholler * the mis-aligned penalty is masked by the slowness of main memory. 21726320Sbholler */ 21736320Sbholler prefetchnta 0x180(%rdx) 21746320Sbholler movdqu -0x40(%rdx),%xmm0 21756320Sbholler movdqu -0x30(%rdx),%xmm1 21766320Sbholler 21776320Sbholler cmp $0x40,%r8 21786320Sbholler movntdq %xmm0,-0x40(%rcx) 21796320Sbholler movntdq %xmm1,-0x30(%rcx) 21806320Sbholler 21816320Sbholler movdqu -0x20(%rdx),%xmm2 21826320Sbholler movdqu -0x10(%rdx),%xmm3 21836320Sbholler 21846320Sbholler movntdq %xmm2,-0x20(%rcx) 21856320Sbholler movntdq %xmm3,-0x10(%rcx) 21866320Sbholler 21876320Sbholler jge L(sse2_nt_move) 21886320Sbholler 21896320Sbholler lea L(Fix16EndTable)(%rip),%r10 21906320Sbholler mov %r8,%r9 21916320Sbholler and $0xFFFFFFFFFFFFFFF0,%r9 21926320Sbholler add %r9,%rcx 21936320Sbholler add %r9,%rdx 21946320Sbholler sub %r9,%r8 21956320Sbholler shr $0x4,%r9 21966320Sbholler sfence 21976320Sbholler 21986320Sbholler movslq (%r10,%r9,4),%r11 21996320Sbholler lea (%r11,%r10,1),%r10 22006320Sbholler jmpq *%r10 22016320Sbholler 22026320Sbholler .balign 16 22036320SbhollerL(Fix16EndTable): 22046320Sbholler .int L(fix16_0)-L(Fix16EndTable) 22056320Sbholler 
.int L(fix16_1)-L(Fix16EndTable) 22066320Sbholler .int L(fix16_2)-L(Fix16EndTable) 22076320Sbholler .int L(fix16_3)-L(Fix16EndTable) 22086320Sbholler 22096320Sbholler .balign 16 22106320SbhollerL(fix16_3): 22116320Sbholler movdqu -0x30(%rdx),%xmm1 22126320Sbholler movdqa %xmm1,-0x30(%rcx) 22136320SbhollerL(fix16_2): 22146320Sbholler movdqu -0x20(%rdx),%xmm2 22156320Sbholler movdqa %xmm2,-0x20(%rcx) 22166320SbhollerL(fix16_1): 22176320Sbholler movdqu -0x10(%rdx),%xmm3 22186320Sbholler movdqa %xmm3,-0x10(%rcx) 22196320SbhollerL(fix16_0): 22206320Sbholler lea L(fwdPxQx)(%rip),%r10 22216320Sbholler add %r8,%rdx 22226320Sbholler add %r8,%rcx 22236320Sbholler 22246320Sbholler movslq (%r10,%r8,4),%r9 22256320Sbholler lea (%r9,%r10,1),%r10 22266320Sbholler jmpq *%r10 22276320Sbholler 22286320Sbholler .balign 16 22296320SbhollerL(pre_both_aligned): 22306320Sbholler cmp $0x80,%r8 22316320Sbholler jl L(fix_16b) 22326320Sbholler 22336320Sbholler .balign 16 22346320SbhollerL(both_aligned): 22356320Sbholler 22366320Sbholler /* 22376320Sbholler * this 'paired' load/load/store/store seems to do best. 
22386320Sbholler */ 22396320Sbholler movdqa (%rdx),%xmm0 22406320Sbholler movdqa 0x10(%rdx),%xmm1 22416320Sbholler 22426320Sbholler movdqa %xmm0,(%rcx) 22436320Sbholler movdqa %xmm1,0x10(%rcx) 22446320Sbholler lea -0x80(%r8),%r8 22456320Sbholler 22466320Sbholler movdqa 0x20(%rdx),%xmm2 22476320Sbholler movdqa 0x30(%rdx),%xmm3 22486320Sbholler 22496320Sbholler movdqa %xmm2,0x20(%rcx) 22506320Sbholler movdqa %xmm3,0x30(%rcx) 22516320Sbholler 22526320Sbholler movdqa 0x40(%rdx),%xmm0 22536320Sbholler movdqa 0x50(%rdx),%xmm1 22546320Sbholler cmp $0x80,%r8 22556320Sbholler 22566320Sbholler movdqa %xmm0,0x40(%rcx) 22576320Sbholler movdqa %xmm1,0x50(%rcx) 22586320Sbholler 22596320Sbholler movdqa 0x60(%rdx),%xmm2 22606320Sbholler movdqa 0x70(%rdx),%xmm3 22616320Sbholler lea 0x80(%rdx),%rdx 22626320Sbholler movdqa %xmm2,0x60(%rcx) 22636320Sbholler movdqa %xmm3,0x70(%rcx) 22646320Sbholler lea 0x80(%rcx),%rcx 22656320Sbholler jge L(both_aligned) 22666320Sbholler 22676320SbhollerL(fix_16b): 22686320Sbholler add %r8,%rcx 22696320Sbholler lea L(fwdPxQx)(%rip),%r10 22706320Sbholler add %r8,%rdx 22716320Sbholler 22726320Sbholler movslq (%r10,%r8,4),%r9 22736320Sbholler lea (%r9,%r10,1),%r10 22746320Sbholler jmpq *%r10 22756320Sbholler 22766320Sbholler .balign 16 22776320SbhollerL(Loop8byte_pre): 22786320Sbholler # Use 8-byte moves 22796320Sbholler mov .largest_level_cache_size(%rip),%r9d 22806320Sbholler shr %r9 # take half of it 22816320Sbholler cmp %r9,%r8 2282*10024Sbostrovs jge L(byte8_nt_top) 22836320Sbholler # Find out whether to use rep movsq 22846320Sbholler cmp $4096,%r8 22856320Sbholler jle L(byte8_top) 22866320Sbholler mov .amd64cache1half(%rip),%r9d # half of l1 cache 22876320Sbholler cmp %r9,%r8 22886320Sbholler jle L(use_rep) 22896320Sbholler 22906320Sbholler .balign 16 22916320SbhollerL(byte8_top): 22926320Sbholler mov (%rdx),%r9 22936320Sbholler mov 0x8(%rdx),%r10 22946320Sbholler lea -0x40(%r8),%r8 22956320Sbholler mov %r9,(%rcx) 22966320Sbholler mov %r10,0x8(%rcx) 
22976320Sbholler mov 0x10(%rdx),%r11 22986320Sbholler mov 0x18(%rdx),%r9 22996320Sbholler mov %r11,0x10(%rcx) 23006320Sbholler mov %r9,0x18(%rcx) 23016320Sbholler 23026320Sbholler cmp $0x40,%r8 23036320Sbholler mov 0x20(%rdx),%r10 23046320Sbholler mov 0x28(%rdx),%r11 23056320Sbholler mov %r10,0x20(%rcx) 23066320Sbholler mov %r11,0x28(%rcx) 23076320Sbholler mov 0x30(%rdx),%r9 23086320Sbholler mov 0x38(%rdx),%r10 23096320Sbholler lea 0x40(%rdx),%rdx 23106320Sbholler mov %r9,0x30(%rcx) 23116320Sbholler mov %r10,0x38(%rcx) 23126320Sbholler lea 0x40(%rcx),%rcx 23136320Sbholler jg L(byte8_top) 23146320Sbholler 23156320SbhollerL(byte8_end): 23166320Sbholler lea L(fwdPxQx)(%rip),%r10 23176320Sbholler lea (%rdx,%r8,1),%rdx 23186320Sbholler lea (%rcx,%r8,1),%rcx 23196320Sbholler 23206320Sbholler movslq (%r10,%r8,4),%r9 23216320Sbholler lea (%r9,%r10,1),%r10 23226320Sbholler jmpq *%r10 23236320Sbholler 23246320Sbholler .balign 16 23256320SbhollerL(use_rep): 23266320Sbholler mov %rdx,%rsi # %rsi = source 23276320Sbholler mov %rcx,%rdi # %rdi = destination 23286320Sbholler mov %r8,%rcx # %rcx = count 23296320Sbholler shrq $3,%rcx # 8-byte word count 23300Sstevel@tonic-gate rep 23316320Sbholler movsq 23326320Sbholler mov %rsi,%rdx # source 23336320Sbholler mov %rdi,%rcx # destination 23346320Sbholler andq $7,%r8 # remainder 23356320Sbholler jnz L(byte8_end) 23360Sstevel@tonic-gate ret 23370Sstevel@tonic-gate 23386320Sbholler .balign 16 23396320SbhollerL(byte8_nt_top): 23406320Sbholler sub $0x40,%r8 23416320Sbholler prefetchnta 0x180(%rdx) 23426320Sbholler mov (%rdx),%r9 23436320Sbholler movnti %r9,(%rcx) 23446320Sbholler mov 0x8(%rdx),%r10 23456320Sbholler movnti %r10,0x8(%rcx) 23466320Sbholler mov 0x10(%rdx),%r11 23476320Sbholler movnti %r11,0x10(%rcx) 23486320Sbholler mov 0x18(%rdx),%r9 23496320Sbholler movnti %r9,0x18(%rcx) 23506320Sbholler mov 0x20(%rdx),%r10 23516320Sbholler movnti %r10,0x20(%rcx) 23526320Sbholler mov 0x28(%rdx),%r11 23536320Sbholler movnti %r11,0x28(%rcx) 
23546320Sbholler mov 0x30(%rdx),%r9 23556320Sbholler movnti %r9,0x30(%rcx) 23566320Sbholler mov 0x38(%rdx),%r10 23576320Sbholler movnti %r10,0x38(%rcx) 23586320Sbholler 23596320Sbholler lea 0x40(%rdx),%rdx 23606320Sbholler lea 0x40(%rcx),%rcx 23616320Sbholler cmp $0x40,%r8 23626320Sbholler jge L(byte8_nt_top) 23636320Sbholler sfence 23646320Sbholler jmp L(byte8_end) 23656320Sbholler 23666320Sbholler SET_SIZE(memcpy) 23676320Sbholler 23686320Sbholler .balign 16 23696320SbhollerL(CopyBackwards): 23706320Sbholler mov %rdx,%r8 23716320Sbholler mov %rdi,%rcx 23726320Sbholler mov %rsi,%rdx 23736320Sbholler mov %rdi,%rax # return value 23746320Sbholler 23756320Sbholler # ck alignment of last byte 23766320Sbholler lea (%rcx,%r8,1),%rcx 23776320Sbholler test $0x7,%rcx 23786320Sbholler lea (%rdx,%r8,1),%rdx 23796320Sbholler jne L(bk_align) 23806320Sbholler 23816320SbhollerL(bk_qw_aligned): 23826320Sbholler lea L(bkPxQx)(%rip),%r10 23836320Sbholler 23846320Sbholler cmp $0x90,%r8 # 144 23856320Sbholler jg L(bk_ck_sse2_alignment) 23866320Sbholler 23876320Sbholler sub %r8,%rcx 23886320Sbholler sub %r8,%rdx 23896320Sbholler 23906320Sbholler movslq (%r10,%r8,4),%r9 23916320Sbholler lea (%r9,%r10,1),%r10 23926320Sbholler jmpq *%r10 23936320Sbholler 23946320Sbholler .balign 16 23956320SbhollerL(bk_align): 23966320Sbholler # only align if len > 8 23976320Sbholler cmp $8,%r8 23986320Sbholler jle L(bk_qw_aligned) 23996320Sbholler test $0x1,%rcx 24006320Sbholler je L(bk_tst2) 24016320Sbholler dec %rcx 24026320Sbholler dec %rdx 24036320Sbholler dec %r8 24046320Sbholler mov (%rdx),%r9b 24056320Sbholler mov %r9b,(%rcx) 24066320Sbholler 24076320SbhollerL(bk_tst2): 24086320Sbholler test $0x2,%rcx 24096320Sbholler je L(bk_tst3) 24106320Sbholler 24116320SbhollerL(bk_got2): 24126320Sbholler sub $0x2,%rcx 24136320Sbholler sub $0x2,%rdx 24146320Sbholler sub $0x2,%r8 24156320Sbholler movzwq (%rdx),%r9 24166320Sbholler mov %r9w,(%rcx) 24176320Sbholler 24186320SbhollerL(bk_tst3): 24196320Sbholler 
test $0x4,%rcx 24206320Sbholler je L(bk_qw_aligned) 24216320Sbholler 24226320SbhollerL(bk_got3): 24236320Sbholler sub $0x4,%rcx 24246320Sbholler sub $0x4,%rdx 24256320Sbholler sub $0x4,%r8 24266320Sbholler mov (%rdx),%r9d 24276320Sbholler mov %r9d,(%rcx) 24286320Sbholler jmp L(bk_qw_aligned) 24296320Sbholler 24306320Sbholler .balign 16 24316320SbhollerL(bk_ck_sse2_alignment): 24326320Sbholler cmpl $NO_SSE,.memops_method(%rip) 24336320Sbholler je L(bk_use_rep) 24346320Sbholler # check alignment of last byte 24356320Sbholler test $0xf,%rcx 24366320Sbholler jz L(bk_sse2_cpy) 24376320Sbholler 24386320SbhollerL(bk_sse2_align): 24396320Sbholler # only here if already aligned on at least a qword bndry 24406320Sbholler sub $0x8,%rcx 24416320Sbholler sub $0x8,%rdx 24426320Sbholler sub $0x8,%r8 24436320Sbholler mov (%rdx),%r9 24446320Sbholler mov %r9,(%rcx) 24456320Sbholler #jmp L(bk_sse2_cpy) 24466320Sbholler 24476320Sbholler .balign 16 24486320SbhollerL(bk_sse2_cpy): 24496320Sbholler sub $0x80,%rcx # 128 24506320Sbholler sub $0x80,%rdx 24516320Sbholler movdqu 0x70(%rdx),%xmm3 24526320Sbholler movdqu 0x60(%rdx),%xmm2 24536320Sbholler movdqa %xmm3,0x70(%rcx) 24546320Sbholler movdqa %xmm2,0x60(%rcx) 24556320Sbholler sub $0x80,%r8 24566320Sbholler movdqu 0x50(%rdx),%xmm1 24576320Sbholler movdqu 0x40(%rdx),%xmm0 24586320Sbholler movdqa %xmm1,0x50(%rcx) 24596320Sbholler movdqa %xmm0,0x40(%rcx) 24606320Sbholler 24616320Sbholler cmp $0x80,%r8 24626320Sbholler movdqu 0x30(%rdx),%xmm3 24636320Sbholler movdqu 0x20(%rdx),%xmm2 24646320Sbholler movdqa %xmm3,0x30(%rcx) 24656320Sbholler movdqa %xmm2,0x20(%rcx) 24666320Sbholler movdqu 0x10(%rdx),%xmm1 24676320Sbholler movdqu (%rdx),%xmm0 24686320Sbholler movdqa %xmm1,0x10(%rcx) 24696320Sbholler movdqa %xmm0,(%rcx) 24706320Sbholler jge L(bk_sse2_cpy) 24716320Sbholler 24726320SbhollerL(bk_sse2_cpy_end): 24736320Sbholler lea L(bkPxQx)(%rip),%r10 24746320Sbholler sub %r8,%rdx 24756320Sbholler sub %r8,%rcx 24766320Sbholler movslq 
(%r10,%r8,4),%r9 24776320Sbholler lea (%r9,%r10,1),%r10 24786320Sbholler jmpq *%r10 24796320Sbholler 24806320Sbholler .balign 16 24816320SbhollerL(bk_use_rep): 24826320Sbholler xchg %rcx,%r9 24836320Sbholler mov %rdx,%rsi # source 24846320Sbholler mov %r9,%rdi # destination 24856320Sbholler mov %r8,%rcx # count 24866320Sbholler sub $8,%rsi 24876320Sbholler sub $8,%rdi 24886320Sbholler shr $3,%rcx 24896320Sbholler std # reverse direction 24906320Sbholler rep 24916320Sbholler movsq 24926320Sbholler cld # reset direction flag 24936320Sbholler 24946320Sbholler xchg %rcx,%r9 24956320Sbholler lea L(bkPxQx)(%rip),%r10 24966320Sbholler sub %r8,%rdx 24976320Sbholler sub %r8,%rcx 24986320Sbholler andq $7,%r8 # remainder 24996320Sbholler jz 2f 25006320Sbholler movslq (%r10,%r8,4),%r9 25016320Sbholler lea (%r9,%r10,1),%r10 25026320Sbholler jmpq *%r10 25036320Sbholler2: 25046320Sbholler ret 25056320Sbholler 25066320Sbholler .balign 16 25076320SbhollerL(bkP0QI): 25086320Sbholler mov 0x88(%rdx),%r10 25096320Sbholler mov %r10,0x88(%rcx) 25106320SbhollerL(bkP0QH): 25116320Sbholler mov 0x80(%rdx),%r10 25126320Sbholler mov %r10,0x80(%rcx) 25136320SbhollerL(bkP0QG): 25146320Sbholler mov 0x78(%rdx),%r9 25156320Sbholler mov %r9,0x78(%rcx) 25166320SbhollerL(bkP0QF): 25176320Sbholler mov 0x70(%rdx),%r11 25186320Sbholler mov %r11,0x70(%rcx) 25196320SbhollerL(bkP0QE): 25206320Sbholler mov 0x68(%rdx),%r10 25216320Sbholler mov %r10,0x68(%rcx) 25226320SbhollerL(bkP0QD): 25236320Sbholler mov 0x60(%rdx),%r9 25246320Sbholler mov %r9,0x60(%rcx) 25256320SbhollerL(bkP0QC): 25266320Sbholler mov 0x58(%rdx),%r11 25276320Sbholler mov %r11,0x58(%rcx) 25286320SbhollerL(bkP0QB): 25296320Sbholler mov 0x50(%rdx),%r10 25306320Sbholler mov %r10,0x50(%rcx) 25316320SbhollerL(bkP0QA): 25326320Sbholler mov 0x48(%rdx),%r9 25336320Sbholler mov %r9,0x48(%rcx) 25346320SbhollerL(bkP0Q9): 25356320Sbholler mov 0x40(%rdx),%r11 25366320Sbholler mov %r11,0x40(%rcx) 25376320SbhollerL(bkP0Q8): 25386320Sbholler mov 
0x38(%rdx),%r10 25396320Sbholler mov %r10,0x38(%rcx) 25406320SbhollerL(bkP0Q7): 25416320Sbholler mov 0x30(%rdx),%r9 25426320Sbholler mov %r9,0x30(%rcx) 25436320SbhollerL(bkP0Q6): 25446320Sbholler mov 0x28(%rdx),%r11 25456320Sbholler mov %r11,0x28(%rcx) 25466320SbhollerL(bkP0Q5): 25476320Sbholler mov 0x20(%rdx),%r10 25486320Sbholler mov %r10,0x20(%rcx) 25496320SbhollerL(bkP0Q4): 25506320Sbholler mov 0x18(%rdx),%r9 25516320Sbholler mov %r9,0x18(%rcx) 25526320SbhollerL(bkP0Q3): 25536320Sbholler mov 0x10(%rdx),%r11 25546320Sbholler mov %r11,0x10(%rcx) 25556320SbhollerL(bkP0Q2): 25566320Sbholler mov 0x8(%rdx),%r10 25576320Sbholler mov %r10,0x8(%rcx) 25586320SbhollerL(bkP0Q1): 25596320Sbholler mov (%rdx),%r9 25606320Sbholler mov %r9,(%rcx) 25616320SbhollerL(bkP0Q0): 25626320Sbholler ret 25636320Sbholler 25646320Sbholler .balign 16 25656320SbhollerL(bkP1QI): 25666320Sbholler mov 0x89(%rdx),%r10 25676320Sbholler mov %r10,0x89(%rcx) 25686320SbhollerL(bkP1QH): 25696320Sbholler mov 0x81(%rdx),%r11 25706320Sbholler mov %r11,0x81(%rcx) 25716320SbhollerL(bkP1QG): 25726320Sbholler mov 0x79(%rdx),%r10 25736320Sbholler mov %r10,0x79(%rcx) 25746320SbhollerL(bkP1QF): 25756320Sbholler mov 0x71(%rdx),%r9 25766320Sbholler mov %r9,0x71(%rcx) 25776320SbhollerL(bkP1QE): 25786320Sbholler mov 0x69(%rdx),%r11 25796320Sbholler mov %r11,0x69(%rcx) 25806320SbhollerL(bkP1QD): 25816320Sbholler mov 0x61(%rdx),%r10 25826320Sbholler mov %r10,0x61(%rcx) 25836320SbhollerL(bkP1QC): 25846320Sbholler mov 0x59(%rdx),%r9 25856320Sbholler mov %r9,0x59(%rcx) 25866320SbhollerL(bkP1QB): 25876320Sbholler mov 0x51(%rdx),%r11 25886320Sbholler mov %r11,0x51(%rcx) 25896320SbhollerL(bkP1QA): 25906320Sbholler mov 0x49(%rdx),%r10 25916320Sbholler mov %r10,0x49(%rcx) 25926320SbhollerL(bkP1Q9): 25936320Sbholler mov 0x41(%rdx),%r9 25946320Sbholler mov %r9,0x41(%rcx) 25956320SbhollerL(bkP1Q8): 25966320Sbholler mov 0x39(%rdx),%r11 25976320Sbholler mov %r11,0x39(%rcx) 25986320SbhollerL(bkP1Q7): 25996320Sbholler mov 
0x31(%rdx),%r10 26006320Sbholler mov %r10,0x31(%rcx) 26016320SbhollerL(bkP1Q6): 26026320Sbholler mov 0x29(%rdx),%r9 26036320Sbholler mov %r9,0x29(%rcx) 26046320SbhollerL(bkP1Q5): 26056320Sbholler mov 0x21(%rdx),%r11 26066320Sbholler mov %r11,0x21(%rcx) 26076320SbhollerL(bkP1Q4): 26086320Sbholler mov 0x19(%rdx),%r10 26096320Sbholler mov %r10,0x19(%rcx) 26106320SbhollerL(bkP1Q3): 26116320Sbholler mov 0x11(%rdx),%r9 26126320Sbholler mov %r9,0x11(%rcx) 26136320SbhollerL(bkP1Q2): 26146320Sbholler mov 0x9(%rdx),%r11 26156320Sbholler mov %r11,0x9(%rcx) 26166320SbhollerL(bkP1Q1): 26176320Sbholler mov 0x1(%rdx),%r10 26186320Sbholler mov %r10,0x1(%rcx) 26196320SbhollerL(bkP1Q0): 26206320Sbholler mov (%rdx),%r9b 26216320Sbholler mov %r9b,(%rcx) 26226320Sbholler ret 26236320Sbholler 26246320Sbholler .balign 16 26256320SbhollerL(bkP2QI): 26266320Sbholler mov 0x8a(%rdx),%r10 26276320Sbholler mov %r10,0x8a(%rcx) 26286320SbhollerL(bkP2QH): 26296320Sbholler mov 0x82(%rdx),%r11 26306320Sbholler mov %r11,0x82(%rcx) 26316320SbhollerL(bkP2QG): 26326320Sbholler mov 0x7a(%rdx),%r10 26336320Sbholler mov %r10,0x7a(%rcx) 26346320SbhollerL(bkP2QF): 26356320Sbholler mov 0x72(%rdx),%r9 26366320Sbholler mov %r9,0x72(%rcx) 26376320SbhollerL(bkP2QE): 26386320Sbholler mov 0x6a(%rdx),%r11 26396320Sbholler mov %r11,0x6a(%rcx) 26406320SbhollerL(bkP2QD): 26416320Sbholler mov 0x62(%rdx),%r10 26426320Sbholler mov %r10,0x62(%rcx) 26436320SbhollerL(bkP2QC): 26446320Sbholler mov 0x5a(%rdx),%r9 26456320Sbholler mov %r9,0x5a(%rcx) 26466320SbhollerL(bkP2QB): 26476320Sbholler mov 0x52(%rdx),%r11 26486320Sbholler mov %r11,0x52(%rcx) 26496320SbhollerL(bkP2QA): 26506320Sbholler mov 0x4a(%rdx),%r10 26516320Sbholler mov %r10,0x4a(%rcx) 26526320SbhollerL(bkP2Q9): 26536320Sbholler mov 0x42(%rdx),%r9 26546320Sbholler mov %r9,0x42(%rcx) 26556320SbhollerL(bkP2Q8): 26566320Sbholler mov 0x3a(%rdx),%r11 26576320Sbholler mov %r11,0x3a(%rcx) 26586320SbhollerL(bkP2Q7): 26596320Sbholler mov 0x32(%rdx),%r10 26606320Sbholler mov 
%r10,0x32(%rcx) 26616320SbhollerL(bkP2Q6): 26626320Sbholler mov 0x2a(%rdx),%r9 26636320Sbholler mov %r9,0x2a(%rcx) 26646320SbhollerL(bkP2Q5): 26656320Sbholler mov 0x22(%rdx),%r11 26666320Sbholler mov %r11,0x22(%rcx) 26676320SbhollerL(bkP2Q4): 26686320Sbholler mov 0x1a(%rdx),%r10 26696320Sbholler mov %r10,0x1a(%rcx) 26706320SbhollerL(bkP2Q3): 26716320Sbholler mov 0x12(%rdx),%r9 26726320Sbholler mov %r9,0x12(%rcx) 26736320SbhollerL(bkP2Q2): 26746320Sbholler mov 0xa(%rdx),%r11 26756320Sbholler mov %r11,0xa(%rcx) 26766320SbhollerL(bkP2Q1): 26776320Sbholler mov 0x2(%rdx),%r10 26786320Sbholler mov %r10,0x2(%rcx) 26796320SbhollerL(bkP2Q0): 26806320Sbholler mov (%rdx),%r9w 26816320Sbholler mov %r9w,(%rcx) 26826320Sbholler ret 26836320Sbholler 26846320Sbholler .balign 16 26856320SbhollerL(bkP3QI): 26866320Sbholler mov 0x8b(%rdx),%r10 26876320Sbholler mov %r10,0x8b(%rcx) 26886320SbhollerL(bkP3QH): 26896320Sbholler mov 0x83(%rdx),%r11 26906320Sbholler mov %r11,0x83(%rcx) 26916320SbhollerL(bkP3QG): 26926320Sbholler mov 0x7b(%rdx),%r10 26936320Sbholler mov %r10,0x7b(%rcx) 26946320SbhollerL(bkP3QF): 26956320Sbholler mov 0x73(%rdx),%r9 26966320Sbholler mov %r9,0x73(%rcx) 26976320SbhollerL(bkP3QE): 26986320Sbholler mov 0x6b(%rdx),%r11 26996320Sbholler mov %r11,0x6b(%rcx) 27006320SbhollerL(bkP3QD): 27016320Sbholler mov 0x63(%rdx),%r10 27026320Sbholler mov %r10,0x63(%rcx) 27036320SbhollerL(bkP3QC): 27046320Sbholler mov 0x5b(%rdx),%r9 27056320Sbholler mov %r9,0x5b(%rcx) 27066320SbhollerL(bkP3QB): 27076320Sbholler mov 0x53(%rdx),%r11 27086320Sbholler mov %r11,0x53(%rcx) 27096320SbhollerL(bkP3QA): 27106320Sbholler mov 0x4b(%rdx),%r10 27116320Sbholler mov %r10,0x4b(%rcx) 27126320SbhollerL(bkP3Q9): 27136320Sbholler mov 0x43(%rdx),%r9 27146320Sbholler mov %r9,0x43(%rcx) 27156320SbhollerL(bkP3Q8): 27166320Sbholler mov 0x3b(%rdx),%r11 27176320Sbholler mov %r11,0x3b(%rcx) 27186320SbhollerL(bkP3Q7): 27196320Sbholler mov 0x33(%rdx),%r10 27206320Sbholler mov %r10,0x33(%rcx) 
27216320SbhollerL(bkP3Q6): 27226320Sbholler mov 0x2b(%rdx),%r9 27236320Sbholler mov %r9,0x2b(%rcx) 27246320SbhollerL(bkP3Q5): 27256320Sbholler mov 0x23(%rdx),%r11 27266320Sbholler mov %r11,0x23(%rcx) 27276320SbhollerL(bkP3Q4): 27286320Sbholler mov 0x1b(%rdx),%r10 27296320Sbholler mov %r10,0x1b(%rcx) 27306320SbhollerL(bkP3Q3): 27316320Sbholler mov 0x13(%rdx),%r9 27326320Sbholler mov %r9,0x13(%rcx) 27336320SbhollerL(bkP3Q2): 27346320Sbholler mov 0xb(%rdx),%r11 27356320Sbholler mov %r11,0xb(%rcx) 27366320SbhollerL(bkP3Q1): 27376320Sbholler mov 0x3(%rdx),%r10 27386320Sbholler mov %r10,0x3(%rcx) 27396320SbhollerL(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores 27406320Sbholler mov 0x1(%rdx),%r9w 27416320Sbholler mov %r9w,0x1(%rcx) 27426320Sbholler mov (%rdx),%r10b 27436320Sbholler mov %r10b,(%rcx) 27446320Sbholler ret 27456320Sbholler 27466320Sbholler .balign 16 27476320SbhollerL(bkP4QI): 27486320Sbholler mov 0x8c(%rdx),%r10 27496320Sbholler mov %r10,0x8c(%rcx) 27506320SbhollerL(bkP4QH): 27516320Sbholler mov 0x84(%rdx),%r11 27526320Sbholler mov %r11,0x84(%rcx) 27536320SbhollerL(bkP4QG): 27546320Sbholler mov 0x7c(%rdx),%r10 27556320Sbholler mov %r10,0x7c(%rcx) 27566320SbhollerL(bkP4QF): 27576320Sbholler mov 0x74(%rdx),%r9 27586320Sbholler mov %r9,0x74(%rcx) 27596320SbhollerL(bkP4QE): 27606320Sbholler mov 0x6c(%rdx),%r11 27616320Sbholler mov %r11,0x6c(%rcx) 27626320SbhollerL(bkP4QD): 27636320Sbholler mov 0x64(%rdx),%r10 27646320Sbholler mov %r10,0x64(%rcx) 27656320SbhollerL(bkP4QC): 27666320Sbholler mov 0x5c(%rdx),%r9 27676320Sbholler mov %r9,0x5c(%rcx) 27686320SbhollerL(bkP4QB): 27696320Sbholler mov 0x54(%rdx),%r11 27706320Sbholler mov %r11,0x54(%rcx) 27716320SbhollerL(bkP4QA): 27726320Sbholler mov 0x4c(%rdx),%r10 27736320Sbholler mov %r10,0x4c(%rcx) 27746320SbhollerL(bkP4Q9): 27756320Sbholler mov 0x44(%rdx),%r9 27766320Sbholler mov %r9,0x44(%rcx) 27776320SbhollerL(bkP4Q8): 27786320Sbholler mov 0x3c(%rdx),%r11 27796320Sbholler mov 
%r11,0x3c(%rcx) 27806320SbhollerL(bkP4Q7): 27816320Sbholler mov 0x34(%rdx),%r10 27826320Sbholler mov %r10,0x34(%rcx) 27836320SbhollerL(bkP4Q6): 27846320Sbholler mov 0x2c(%rdx),%r9 27856320Sbholler mov %r9,0x2c(%rcx) 27866320SbhollerL(bkP4Q5): 27876320Sbholler mov 0x24(%rdx),%r11 27886320Sbholler mov %r11,0x24(%rcx) 27896320SbhollerL(bkP4Q4): 27906320Sbholler mov 0x1c(%rdx),%r10 27916320Sbholler mov %r10,0x1c(%rcx) 27926320SbhollerL(bkP4Q3): 27936320Sbholler mov 0x14(%rdx),%r9 27946320Sbholler mov %r9,0x14(%rcx) 27956320SbhollerL(bkP4Q2): 27966320Sbholler mov 0xc(%rdx),%r11 27976320Sbholler mov %r11,0xc(%rcx) 27986320SbhollerL(bkP4Q1): 27996320Sbholler mov 0x4(%rdx),%r10 28006320Sbholler mov %r10,0x4(%rcx) 28016320SbhollerL(bkP4Q0): 28026320Sbholler mov (%rdx),%r9d 28036320Sbholler mov %r9d,(%rcx) 28046320Sbholler ret 28056320Sbholler 28066320Sbholler .balign 16 28076320SbhollerL(bkP5QI): 28086320Sbholler mov 0x8d(%rdx),%r10 28096320Sbholler mov %r10,0x8d(%rcx) 28106320SbhollerL(bkP5QH): 28116320Sbholler mov 0x85(%rdx),%r9 28126320Sbholler mov %r9,0x85(%rcx) 28136320SbhollerL(bkP5QG): 28146320Sbholler mov 0x7d(%rdx),%r11 28156320Sbholler mov %r11,0x7d(%rcx) 28166320SbhollerL(bkP5QF): 28176320Sbholler mov 0x75(%rdx),%r10 28186320Sbholler mov %r10,0x75(%rcx) 28196320SbhollerL(bkP5QE): 28206320Sbholler mov 0x6d(%rdx),%r9 28216320Sbholler mov %r9,0x6d(%rcx) 28226320SbhollerL(bkP5QD): 28236320Sbholler mov 0x65(%rdx),%r11 28246320Sbholler mov %r11,0x65(%rcx) 28256320SbhollerL(bkP5QC): 28266320Sbholler mov 0x5d(%rdx),%r10 28276320Sbholler mov %r10,0x5d(%rcx) 28286320SbhollerL(bkP5QB): 28296320Sbholler mov 0x55(%rdx),%r9 28306320Sbholler mov %r9,0x55(%rcx) 28316320SbhollerL(bkP5QA): 28326320Sbholler mov 0x4d(%rdx),%r11 28336320Sbholler mov %r11,0x4d(%rcx) 28346320SbhollerL(bkP5Q9): 28356320Sbholler mov 0x45(%rdx),%r10 28366320Sbholler mov %r10,0x45(%rcx) 28376320SbhollerL(bkP5Q8): 28386320Sbholler mov 0x3d(%rdx),%r9 28396320Sbholler mov %r9,0x3d(%rcx) 
28406320SbhollerL(bkP5Q7): 28416320Sbholler mov 0x35(%rdx),%r11 28426320Sbholler mov %r11,0x35(%rcx) 28436320SbhollerL(bkP5Q6): 28446320Sbholler mov 0x2d(%rdx),%r10 28456320Sbholler mov %r10,0x2d(%rcx) 28466320SbhollerL(bkP5Q5): 28476320Sbholler mov 0x25(%rdx),%r9 28486320Sbholler mov %r9,0x25(%rcx) 28496320SbhollerL(bkP5Q4): 28506320Sbholler mov 0x1d(%rdx),%r11 28516320Sbholler mov %r11,0x1d(%rcx) 28526320SbhollerL(bkP5Q3): 28536320Sbholler mov 0x15(%rdx),%r10 28546320Sbholler mov %r10,0x15(%rcx) 28556320SbhollerL(bkP5Q2): 28566320Sbholler mov 0xd(%rdx),%r9 28576320Sbholler mov %r9,0xd(%rcx) 28586320SbhollerL(bkP5Q1): 28596320Sbholler mov 0x5(%rdx),%r11 28606320Sbholler mov %r11,0x5(%rcx) 28616320SbhollerL(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores 28626320Sbholler mov 0x1(%rdx),%r9d 28636320Sbholler mov %r9d,0x1(%rcx) 28646320Sbholler mov (%rdx),%r10b 28656320Sbholler mov %r10b,(%rcx) 28666320Sbholler ret 28676320Sbholler 28686320Sbholler .balign 16 28696320SbhollerL(bkP6QI): 28706320Sbholler mov 0x8e(%rdx),%r10 28716320Sbholler mov %r10,0x8e(%rcx) 28726320SbhollerL(bkP6QH): 28736320Sbholler mov 0x86(%rdx),%r11 28746320Sbholler mov %r11,0x86(%rcx) 28756320SbhollerL(bkP6QG): 28766320Sbholler mov 0x7e(%rdx),%r10 28776320Sbholler mov %r10,0x7e(%rcx) 28786320SbhollerL(bkP6QF): 28796320Sbholler mov 0x76(%rdx),%r9 28806320Sbholler mov %r9,0x76(%rcx) 28816320SbhollerL(bkP6QE): 28826320Sbholler mov 0x6e(%rdx),%r11 28836320Sbholler mov %r11,0x6e(%rcx) 28846320SbhollerL(bkP6QD): 28856320Sbholler mov 0x66(%rdx),%r10 28866320Sbholler mov %r10,0x66(%rcx) 28876320SbhollerL(bkP6QC): 28886320Sbholler mov 0x5e(%rdx),%r9 28896320Sbholler mov %r9,0x5e(%rcx) 28906320SbhollerL(bkP6QB): 28916320Sbholler mov 0x56(%rdx),%r11 28926320Sbholler mov %r11,0x56(%rcx) 28936320SbhollerL(bkP6QA): 28946320Sbholler mov 0x4e(%rdx),%r10 28956320Sbholler mov %r10,0x4e(%rcx) 28966320SbhollerL(bkP6Q9): 28976320Sbholler mov 0x46(%rdx),%r9 28986320Sbholler mov 
%r9,0x46(%rcx) 28996320SbhollerL(bkP6Q8): 29006320Sbholler mov 0x3e(%rdx),%r11 29016320Sbholler mov %r11,0x3e(%rcx) 29026320SbhollerL(bkP6Q7): 29036320Sbholler mov 0x36(%rdx),%r10 29046320Sbholler mov %r10,0x36(%rcx) 29056320SbhollerL(bkP6Q6): 29066320Sbholler mov 0x2e(%rdx),%r9 29076320Sbholler mov %r9,0x2e(%rcx) 29086320SbhollerL(bkP6Q5): 29096320Sbholler mov 0x26(%rdx),%r11 29106320Sbholler mov %r11,0x26(%rcx) 29116320SbhollerL(bkP6Q4): 29126320Sbholler mov 0x1e(%rdx),%r10 29136320Sbholler mov %r10,0x1e(%rcx) 29146320SbhollerL(bkP6Q3): 29156320Sbholler mov 0x16(%rdx),%r9 29166320Sbholler mov %r9,0x16(%rcx) 29176320SbhollerL(bkP6Q2): 29186320Sbholler mov 0xe(%rdx),%r11 29196320Sbholler mov %r11,0xe(%rcx) 29206320SbhollerL(bkP6Q1): 29216320Sbholler mov 0x6(%rdx),%r10 29226320Sbholler mov %r10,0x6(%rcx) 29236320SbhollerL(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores 29246320Sbholler mov 0x2(%rdx),%r9d 29256320Sbholler mov %r9d,0x2(%rcx) 29266320Sbholler mov (%rdx),%r10w 29276320Sbholler mov %r10w,(%rcx) 29286320Sbholler ret 29296320Sbholler 29306320Sbholler .balign 16 29316320SbhollerL(bkP7QI): 29326320Sbholler mov 0x8f(%rdx),%r10 29336320Sbholler mov %r10,0x8f(%rcx) 29346320SbhollerL(bkP7QH): 29356320Sbholler mov 0x87(%rdx),%r11 29366320Sbholler mov %r11,0x87(%rcx) 29376320SbhollerL(bkP7QG): 29386320Sbholler mov 0x7f(%rdx),%r10 29396320Sbholler mov %r10,0x7f(%rcx) 29406320SbhollerL(bkP7QF): 29416320Sbholler mov 0x77(%rdx),%r9 29426320Sbholler mov %r9,0x77(%rcx) 29436320SbhollerL(bkP7QE): 29446320Sbholler mov 0x6f(%rdx),%r11 29456320Sbholler mov %r11,0x6f(%rcx) 29466320SbhollerL(bkP7QD): 29476320Sbholler mov 0x67(%rdx),%r10 29486320Sbholler mov %r10,0x67(%rcx) 29496320SbhollerL(bkP7QC): 29506320Sbholler mov 0x5f(%rdx),%r9 29516320Sbholler mov %r9,0x5f(%rcx) 29526320SbhollerL(bkP7QB): 29536320Sbholler mov 0x57(%rdx),%r11 29546320Sbholler mov %r11,0x57(%rcx) 29556320SbhollerL(bkP7QA): 29566320Sbholler mov 0x4f(%rdx),%r10 29576320Sbholler 
mov %r10,0x4f(%rcx) 29586320SbhollerL(bkP7Q9): 29596320Sbholler mov 0x47(%rdx),%r9 29606320Sbholler mov %r9,0x47(%rcx) 29616320SbhollerL(bkP7Q8): 29626320Sbholler mov 0x3f(%rdx),%r11 29636320Sbholler mov %r11,0x3f(%rcx) 29646320SbhollerL(bkP7Q7): 29656320Sbholler mov 0x37(%rdx),%r10 29666320Sbholler mov %r10,0x37(%rcx) 29676320SbhollerL(bkP7Q6): 29686320Sbholler mov 0x2f(%rdx),%r9 29696320Sbholler mov %r9,0x2f(%rcx) 29706320SbhollerL(bkP7Q5): 29716320Sbholler mov 0x27(%rdx),%r11 29726320Sbholler mov %r11,0x27(%rcx) 29736320SbhollerL(bkP7Q4): 29746320Sbholler mov 0x1f(%rdx),%r10 29756320Sbholler mov %r10,0x1f(%rcx) 29766320SbhollerL(bkP7Q3): 29776320Sbholler mov 0x17(%rdx),%r9 29786320Sbholler mov %r9,0x17(%rcx) 29796320SbhollerL(bkP7Q2): 29806320Sbholler mov 0xf(%rdx),%r11 29816320Sbholler mov %r11,0xf(%rcx) 29826320SbhollerL(bkP7Q1): 29836320Sbholler mov 0x7(%rdx),%r10 29846320Sbholler mov %r10,0x7(%rcx) 29856320SbhollerL(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores 29866320Sbholler mov 0x3(%rdx),%r9d 29876320Sbholler mov %r9d,0x3(%rcx) 29886320Sbholler mov 0x1(%rdx),%r10w 29896320Sbholler mov %r10w,0x1(%rcx) 29906320Sbholler mov (%rdx),%r11b 29916320Sbholler mov %r11b,(%rcx) 29926320Sbholler ret 29936320Sbholler 29946320Sbholler .balign 16 29956320SbhollerL(bkPxQx): .int L(bkP0Q0)-L(bkPxQx) 29966320Sbholler .int L(bkP1Q0)-L(bkPxQx) 29976320Sbholler .int L(bkP2Q0)-L(bkPxQx) 29986320Sbholler .int L(bkP3Q0)-L(bkPxQx) 29996320Sbholler .int L(bkP4Q0)-L(bkPxQx) 30006320Sbholler .int L(bkP5Q0)-L(bkPxQx) 30016320Sbholler .int L(bkP6Q0)-L(bkPxQx) 30026320Sbholler .int L(bkP7Q0)-L(bkPxQx) 30036320Sbholler 30046320Sbholler .int L(bkP0Q1)-L(bkPxQx) 30056320Sbholler .int L(bkP1Q1)-L(bkPxQx) 30066320Sbholler .int L(bkP2Q1)-L(bkPxQx) 30076320Sbholler .int L(bkP3Q1)-L(bkPxQx) 30086320Sbholler .int L(bkP4Q1)-L(bkPxQx) 30096320Sbholler .int L(bkP5Q1)-L(bkPxQx) 30106320Sbholler .int L(bkP6Q1)-L(bkPxQx) 30116320Sbholler .int L(bkP7Q1)-L(bkPxQx) 
30126320Sbholler 30136320Sbholler .int L(bkP0Q2)-L(bkPxQx) 30146320Sbholler .int L(bkP1Q2)-L(bkPxQx) 30156320Sbholler .int L(bkP2Q2)-L(bkPxQx) 30166320Sbholler .int L(bkP3Q2)-L(bkPxQx) 30176320Sbholler .int L(bkP4Q2)-L(bkPxQx) 30186320Sbholler .int L(bkP5Q2)-L(bkPxQx) 30196320Sbholler .int L(bkP6Q2)-L(bkPxQx) 30206320Sbholler .int L(bkP7Q2)-L(bkPxQx) 30216320Sbholler 30226320Sbholler .int L(bkP0Q3)-L(bkPxQx) 30236320Sbholler .int L(bkP1Q3)-L(bkPxQx) 30246320Sbholler .int L(bkP2Q3)-L(bkPxQx) 30256320Sbholler .int L(bkP3Q3)-L(bkPxQx) 30266320Sbholler .int L(bkP4Q3)-L(bkPxQx) 30276320Sbholler .int L(bkP5Q3)-L(bkPxQx) 30286320Sbholler .int L(bkP6Q3)-L(bkPxQx) 30296320Sbholler .int L(bkP7Q3)-L(bkPxQx) 30306320Sbholler 30316320Sbholler .int L(bkP0Q4)-L(bkPxQx) 30326320Sbholler .int L(bkP1Q4)-L(bkPxQx) 30336320Sbholler .int L(bkP2Q4)-L(bkPxQx) 30346320Sbholler .int L(bkP3Q4)-L(bkPxQx) 30356320Sbholler .int L(bkP4Q4)-L(bkPxQx) 30366320Sbholler .int L(bkP5Q4)-L(bkPxQx) 30376320Sbholler .int L(bkP6Q4)-L(bkPxQx) 30386320Sbholler .int L(bkP7Q4)-L(bkPxQx) 30396320Sbholler 30406320Sbholler .int L(bkP0Q5)-L(bkPxQx) 30416320Sbholler .int L(bkP1Q5)-L(bkPxQx) 30426320Sbholler .int L(bkP2Q5)-L(bkPxQx) 30436320Sbholler .int L(bkP3Q5)-L(bkPxQx) 30446320Sbholler .int L(bkP4Q5)-L(bkPxQx) 30456320Sbholler .int L(bkP5Q5)-L(bkPxQx) 30466320Sbholler .int L(bkP6Q5)-L(bkPxQx) 30476320Sbholler .int L(bkP7Q5)-L(bkPxQx) 30486320Sbholler 30496320Sbholler .int L(bkP0Q6)-L(bkPxQx) 30506320Sbholler .int L(bkP1Q6)-L(bkPxQx) 30516320Sbholler .int L(bkP2Q6)-L(bkPxQx) 30526320Sbholler .int L(bkP3Q6)-L(bkPxQx) 30536320Sbholler .int L(bkP4Q6)-L(bkPxQx) 30546320Sbholler .int L(bkP5Q6)-L(bkPxQx) 30556320Sbholler .int L(bkP6Q6)-L(bkPxQx) 30566320Sbholler .int L(bkP7Q6)-L(bkPxQx) 30576320Sbholler 30586320Sbholler .int L(bkP0Q7)-L(bkPxQx) 30596320Sbholler .int L(bkP1Q7)-L(bkPxQx) 30606320Sbholler .int L(bkP2Q7)-L(bkPxQx) 30616320Sbholler .int L(bkP3Q7)-L(bkPxQx) 30626320Sbholler .int L(bkP4Q7)-L(bkPxQx) 
30636320Sbholler .int L(bkP5Q7)-L(bkPxQx) 30646320Sbholler .int L(bkP6Q7)-L(bkPxQx) 30656320Sbholler .int L(bkP7Q7)-L(bkPxQx) 30666320Sbholler 30676320Sbholler .int L(bkP0Q8)-L(bkPxQx) 30686320Sbholler .int L(bkP1Q8)-L(bkPxQx) 30696320Sbholler .int L(bkP2Q8)-L(bkPxQx) 30706320Sbholler .int L(bkP3Q8)-L(bkPxQx) 30716320Sbholler .int L(bkP4Q8)-L(bkPxQx) 30726320Sbholler .int L(bkP5Q8)-L(bkPxQx) 30736320Sbholler .int L(bkP6Q8)-L(bkPxQx) 30746320Sbholler .int L(bkP7Q8)-L(bkPxQx) 30756320Sbholler 30766320Sbholler .int L(bkP0Q9)-L(bkPxQx) 30776320Sbholler .int L(bkP1Q9)-L(bkPxQx) 30786320Sbholler .int L(bkP2Q9)-L(bkPxQx) 30796320Sbholler .int L(bkP3Q9)-L(bkPxQx) 30806320Sbholler .int L(bkP4Q9)-L(bkPxQx) 30816320Sbholler .int L(bkP5Q9)-L(bkPxQx) 30826320Sbholler .int L(bkP6Q9)-L(bkPxQx) 30836320Sbholler .int L(bkP7Q9)-L(bkPxQx) 30846320Sbholler 30856320Sbholler .int L(bkP0QA)-L(bkPxQx) 30866320Sbholler .int L(bkP1QA)-L(bkPxQx) 30876320Sbholler .int L(bkP2QA)-L(bkPxQx) 30886320Sbholler .int L(bkP3QA)-L(bkPxQx) 30896320Sbholler .int L(bkP4QA)-L(bkPxQx) 30906320Sbholler .int L(bkP5QA)-L(bkPxQx) 30916320Sbholler .int L(bkP6QA)-L(bkPxQx) 30926320Sbholler .int L(bkP7QA)-L(bkPxQx) 30936320Sbholler 30946320Sbholler .int L(bkP0QB)-L(bkPxQx) 30956320Sbholler .int L(bkP1QB)-L(bkPxQx) 30966320Sbholler .int L(bkP2QB)-L(bkPxQx) 30976320Sbholler .int L(bkP3QB)-L(bkPxQx) 30986320Sbholler .int L(bkP4QB)-L(bkPxQx) 30996320Sbholler .int L(bkP5QB)-L(bkPxQx) 31006320Sbholler .int L(bkP6QB)-L(bkPxQx) 31016320Sbholler .int L(bkP7QB)-L(bkPxQx) 31026320Sbholler 31036320Sbholler .int L(bkP0QC)-L(bkPxQx) 31046320Sbholler .int L(bkP1QC)-L(bkPxQx) 31056320Sbholler .int L(bkP2QC)-L(bkPxQx) 31066320Sbholler .int L(bkP3QC)-L(bkPxQx) 31076320Sbholler .int L(bkP4QC)-L(bkPxQx) 31086320Sbholler .int L(bkP5QC)-L(bkPxQx) 31096320Sbholler .int L(bkP6QC)-L(bkPxQx) 31106320Sbholler .int L(bkP7QC)-L(bkPxQx) 31116320Sbholler 31126320Sbholler .int L(bkP0QD)-L(bkPxQx) 31136320Sbholler .int L(bkP1QD)-L(bkPxQx) 
31146320Sbholler .int L(bkP2QD)-L(bkPxQx) 31156320Sbholler .int L(bkP3QD)-L(bkPxQx) 31166320Sbholler .int L(bkP4QD)-L(bkPxQx) 31176320Sbholler .int L(bkP5QD)-L(bkPxQx) 31186320Sbholler .int L(bkP6QD)-L(bkPxQx) 31196320Sbholler .int L(bkP7QD)-L(bkPxQx) 31206320Sbholler 31216320Sbholler .int L(bkP0QE)-L(bkPxQx) 31226320Sbholler .int L(bkP1QE)-L(bkPxQx) 31236320Sbholler .int L(bkP2QE)-L(bkPxQx) 31246320Sbholler .int L(bkP3QE)-L(bkPxQx) 31256320Sbholler .int L(bkP4QE)-L(bkPxQx) 31266320Sbholler .int L(bkP5QE)-L(bkPxQx) 31276320Sbholler .int L(bkP6QE)-L(bkPxQx) 31286320Sbholler .int L(bkP7QE)-L(bkPxQx) 31296320Sbholler 31306320Sbholler .int L(bkP0QF)-L(bkPxQx) 31316320Sbholler .int L(bkP1QF)-L(bkPxQx) 31326320Sbholler .int L(bkP2QF)-L(bkPxQx) 31336320Sbholler .int L(bkP3QF)-L(bkPxQx) 31346320Sbholler .int L(bkP4QF)-L(bkPxQx) 31356320Sbholler .int L(bkP5QF)-L(bkPxQx) 31366320Sbholler .int L(bkP6QF)-L(bkPxQx) 31376320Sbholler .int L(bkP7QF)-L(bkPxQx) 31386320Sbholler 31396320Sbholler .int L(bkP0QG)-L(bkPxQx) 31406320Sbholler .int L(bkP1QG)-L(bkPxQx) 31416320Sbholler .int L(bkP2QG)-L(bkPxQx) 31426320Sbholler .int L(bkP3QG)-L(bkPxQx) 31436320Sbholler .int L(bkP4QG)-L(bkPxQx) 31446320Sbholler .int L(bkP5QG)-L(bkPxQx) 31456320Sbholler .int L(bkP6QG)-L(bkPxQx) 31466320Sbholler .int L(bkP7QG)-L(bkPxQx) 31476320Sbholler 31486320Sbholler .int L(bkP0QH)-L(bkPxQx) 31496320Sbholler .int L(bkP1QH)-L(bkPxQx) 31506320Sbholler .int L(bkP2QH)-L(bkPxQx) 31516320Sbholler .int L(bkP3QH)-L(bkPxQx) 31526320Sbholler .int L(bkP4QH)-L(bkPxQx) 31536320Sbholler .int L(bkP5QH)-L(bkPxQx) 31546320Sbholler .int L(bkP6QH)-L(bkPxQx) 31556320Sbholler .int L(bkP7QH)-L(bkPxQx) 31566320Sbholler 31576320Sbholler .int L(bkP0QI)-L(bkPxQx) 31586320Sbholler .int L(bkP1QI)-L(bkPxQx) 31596320Sbholler .int L(bkP2QI)-L(bkPxQx) 31606320Sbholler .int L(bkP3QI)-L(bkPxQx) 31616320Sbholler .int L(bkP4QI)-L(bkPxQx) 31626320Sbholler .int L(bkP5QI)-L(bkPxQx) 31636320Sbholler .int L(bkP6QI)-L(bkPxQx) 31646320Sbholler .int 
L(bkP7QI)-L(bkPxQx) 31656320Sbholler 31660Sstevel@tonic-gate SET_SIZE(memmove) 3167