/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	movl	0+12(%esp),%ecx		/ get number of bytes to move
	pushl	%esi			/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+4(%esp),%edi		/ destination buffer address
	movl	8+8(%esp),%esi		/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi		/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jbe	.memcpy_post		/ jump if dst <= src
	cmpl	%edx,%edi
	jbe	.CopyLeft		/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi		/ %edi = dest address
	movl	%edi, %eax		/ save this
	movl	8+8(%esp),%esi		/ %esi = source address
	movl	8+12(%esp),%ecx		/ %ecx = length of string
					/ %edx scratch register
					/ %eax scratch register
.memcpy_post:
	nop				/ this really helps, don't know why
					/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse		/ not worth doing sse for less

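	/
	/ .movew copies the remaining bytes a word at a time, then a byte
	/ at a time.  A rough C equivalent (illustrative sketch only, not
	/ part of the build; d = %edi, s = %esi, n = %ecx):
	/
	/	while (n >= 4) {
	/		*(uint32_t *)d = *(uint32_t *)s;
	/		d += 4; s += 4; n -= 4;
	/	}
	/	while (n-- != 0)
	/		*d++ = *s++;
	/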
.movew:
	movl	%ecx,%edx		/ save byte cnt
	shrl	$2,%ecx			/ %ecx = number of words to move
	rep ; smovl			/ move the words

	andl	$0x3,%edx		/ %edx = number of bytes left to move
	jz	.Return			/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi			/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned		/ jmp if dest is unaligned
.sse:					/ dest is aligned, check source
	movl	%ecx, %edx		/ get byte count
	shrl	$6, %edx		/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da			/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)		/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)		/ prefetch source & copy 64 byte at a time
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx		/ compute remaining bytes
	movl	8+4(%esp), %eax		/ setup return value
	jz	.Return
	jmp	.movew

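	/
	/ The _nt_ loops below are taken for copies of 64K bytes or more
	/ (the cmpl $65535 checks): movntps does non-temporal stores that
	/ bypass the cache, so a very large copy does not displace its
	/ contents, and the mfence/sfence after each loop orders those
	/ weakly-ordered stores before the copy completes.
	/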
	/
	/ use aligned load since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)		/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax			/ subtract from 16 and get destination
	andl	$15, %eax		/ aligned on a 16 byte boundary
	movl	%ecx, %edx		/ saved count
	subl	%eax, %ecx		/ subtract from byte count
	cmpl	$64, %ecx		/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx		/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax		/ and restore return value,
	jb	.movew			/ and do a non-SSE move.
	xchg	%ecx, %eax		/ flip for copy
	rep ; smovb			/ move the bytes
	xchg	%ecx, %eax		/ flip back
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned load since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)		/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned load since source doesn't line up
	/
	.align 16
.sse_da_loop:
	prefetcht0 568(%esi)		/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)

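/ For reference, the memmove entry point above picks between the forward
/ memcpy path and .CopyLeft below roughly as follows (illustrative C sketch
/ only, not part of the build):
/
/	if (n == 0)
/		return (dst);
/	if (dst <= src || dst > src + n - 1)
/		copy forward, sharing the memcpy code;
/	else
/		copy backward via .CopyLeft, so overlapping bytes are read
/		before they are overwritten;
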
/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size < 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/     do the byte copy
	cld				/     reset direction flag to LtoR
	popl	%edi			/ }
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that future
	subl	%ecx,%edx		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)