/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__align_cpy_4.s"

/* __align_cpy_4(s1, s2, n)
 *
 * Copy 4-byte aligned source to 4-byte aligned target in multiples of 4 bytes.
 *
 * Input:
 *	o0	address of target
 *	o1	address of source
 *	o2	number of bytes to copy (must be a multiple of 4)
 * Output:
 *	o0	address of target
 * Caller's registers that have been changed by this function:
 *	o1-o5, g1, g5
 *
 * Note:
 *	This helper routine will not be used by any 32-bit compilations.
 *	To do so would break binary compatibility with previous versions of
 *	Solaris.
 *
 * Assumptions:
 *	Source and target addresses are 4-byte aligned.
 *	Bytes to be copied are non-overlapping or _exactly_ overlapping.
 *	The number of bytes to be copied is a multiple of 4.
 *	Call will usually be made with a byte count of more than 4*4 and
 *	less than a few hundred bytes.  Legal values are 0 to MAX_SIZE_T.
 *
 *	NOTE(review): the .both8 path tests the count with signed branches
 *	(bl/bg on %xcc), so a count with bit 63 set is treated as negative
 *	there.  Confirm whether the "0 to MAX_SIZE_T" range is intended.
 *
 * Register use inside the routine:
 *	o0	never written, so it still holds the target on return
 *	g1	running target pointer (copy of o0)
 *	o1	running source pointer
 *	o2	bytes remaining (.both8 path); data scratch (.noton8 path)
 *	o3-o5	data scratch
 *	g5	end address of source (.noton8 path only)
 *
 * Structure:
 *	.both8	 source and target share the same alignment modulo 8; after
 *		 an optional leading word, copy with 8-byte ldx/stx.
 *	.noton8	 source and target differ modulo 8; copy with 4-byte
 *		 lduw/st, handling the (n mod 16) remainder first.
 *
 * Optimization attempt:
 *	Reasonable speed for a generic v9.
 */

#include <sys/asm_linkage.h>

	ENTRY(__align_cpy_4)
	brz,pn	%o2, .done		! Skip out if no bytes to copy.
	cmp	%o0, %o1		! (delay slot) target == source?
	be,pn	%xcc, .done		! Addresses are identical--done.
	and	%o0, 7, %o3		! (delay slot) target offset within 8.
	and	%o1, 7, %o4		! Source offset within 8.
	cmp	%o3, %o4
	bne,pt	%icc, .noton8		! Exactly one of source and target is
					! 8-byte aligned.
	mov	%o0, %g1		! (delay slot, always executed)
					! g1 = working target pointer.
	brz,pt	%o3, .both8		! Both are 8-byte aligned.
	nop

	ld	[%o1], %o3		! Neither is aligned, so do 4 bytes;
	subcc	%o2, 4, %o2		! then both will be aligned.
	st	%o3, [%g1]
	bz,pn	%xcc, .done		! That word was the whole copy.
	add	%g1, 4, %g1		! (delay slot)
	b	.both8
	add	%o1, 4, %o1		! (delay slot)

! Section of code dealing with case where source and target are both 8-byte
! aligned.  Get and store 16 bytes at a time using ldx and stx.

	.align	32
.both8:					! Both source and target are aligned.
	cmp	%o2, 16
	bl,a,pn	%xcc, .chkwd		! Fewer than 16 bytes: skip the loop.
	cmp	%o2, 8			! (annulled delay slot) runs only if
					! the branch is taken; sets the
					! condition codes that .chkwd tests.

	sub	%o2, 12, %o2		! Bias the count so that the loop test
					! below is "at least 16 bytes left".
.loop16a:				! Load and store 16 bytes at a time.
	ldx	[%o1], %o3
	ldx	[%o1+8], %o4
	subcc	%o2, 16, %o2
	stx	%o3, [%g1]
	stx	%o4, [%g1+8]
	add	%o1, 16, %o1
	bg,pt	%xcc, .loop16a		! Have at least 16 bytes left.
	add	%g1, 16, %g1		! (delay slot)

	addcc	%o2, 12, %o2		! Remove the bias: o2 = bytes left.
	bg,a,pt	%xcc, .chkwd		! Have some remaining bytes.
	cmp	%o2, 8			! (annulled delay slot) as above.
	retl
	nop

.chkwd:					! Entered with cc set by cmp %o2, 8.
	bl,a,pn	%xcc, .wrword		! Only 4 bytes left.
	ld	[%o1], %o3		! (annulled delay slot) fetch the word.

	ldx	[%o1], %o3		! Have 8 or 12, so do 8.
	stx	%o3, [%g1]
	add	%o1, 8, %o1
	add	%g1, 8, %g1
	subcc	%o2, 8, %o2
	bg,a,pn	%xcc, .wrword		! Still have four to do.
	ld	[%o1], %o3		! (annulled delay slot) fetch the word.

	retl
	nop

.wrword:				! Copy final word (already in o3).
	st	%o3, [%g1]

.done:
	retl				! Target address is still in o0.
	nop

! Section of code where either source or target, but not both, are 8-byte
! aligned.  So, use ld and st instructions rather than trying to copy stuff
! around in registers.

	.align	32			! Ultra cache line boundary.
.noton8:
	add	%o1, %o2, %g5		! Ending address of source.
	andcc	%o2, 15, %o3		! Mod 16 of number of bytes to copy.
	bz,pn	%xcc, .loop16		! Copy odd amounts first, then
					! multiples of 16.
	cmp	%o3, 4			! (delay slot)
	bz,pn	%xcc, .mod4
	cmp	%o3, 8			! (delay slot)
	bz,pn	%xcc, .mod8
	cmp	%o3, 12			! (delay slot)
	bz,pt	%xcc, .mod12
	nop
	illtrap	0			! Size not valid.

.mod4:					! Do first 4 bytes, then do multiples
					! of 16.
	lduw	[%o1], %o2
	add	%o1, 4, %o1
	st	%o2, [%g1]
	cmp	%o1, %g5		! Reached the end of the source?
	bl,a,pt	%xcc, .loop16
	add	%g1, 4, %g1		! (annulled delay slot)
	retl
	nop
.mod8:					! Do first 8 bytes, then do multiples
					! of 16.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	add	%o1, 8, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5		! Reached the end of the source?
	bl,a,pt	%xcc, .loop16
	add	%g1, 8, %g1		! (annulled delay slot)
	retl
	nop
.mod12:					! Do first 12 bytes, then do multiples
					! of 16.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	add	%o1, 12, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	st	%o4, [%g1+8]
	cmp	%o1, %g5		! Reached the end of the source?
	bl,a,pt	%xcc, .loop16
	add	%g1, 12, %g1		! (annulled delay slot)
	retl
	nop
	.align	32			! Ultra cache line boundary.
.loop16:				! Do multiples of 16 bytes.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	lduw	[%o1+12], %o5
	add	%o1, 16, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5		! Reached the end of the source?
	st	%o4, [%g1+8]
	st	%o5, [%g1+12]
	bl,a,pt	%xcc, .loop16
	add	%g1, 16, %g1		! (annulled delay slot)
	retl				! Target address is already in o0.
	nop

	SET_SIZE(__align_cpy_4)