/*
 * memmove/memcpy for ARM, Plan 9 assembler.
 *
 *	void *memmove(void *to, void *from, ulong n)
 *	void *memcpy(void *to, void *from, ulong n)
 *
 * Both names share one body, and the body is overlap-safe: when
 * to <= from it copies forward, otherwise backward (see the CMP/BLS
 * at entry), so memcpy gets memmove semantics for free.
 *
 * Register roles (symbolic names below):
 *	R(TS)   = R0  "to start"; holds `to` on entry, saved to to+0(FP)
 *	              so it can be reloaded as the return value at _return
 *	R(TE)   = R1  "to end" = to + n
 *	R(FROM) = R2  source pointer (end pointer on the backward path)
 *	R(N)    = R3  byte count (only live until the end pointers are formed)
 *	R(TMP)  = R3  scratch; reuses R3 once N is dead
 *	R(TMP1) = R4  scratch word for the word-at-a-time loops
 *
 * Strategy, both directions: copy bytes until the destination is
 * 4-aligned; if the source is then also 4-aligned, move 32-byte chunks
 * with MOVM (R4-R11), then words, then trailing bytes.  If the source
 * is misaligned relative to the destination, fall into _bunaligned /
 * _funaligned, which read aligned words and merge neighbouring words
 * with per-alignment shift counts (RSHIFT/LSHIFT/OFFSET).
 */
TS = 0
TE = 1
FROM = 2
N = 3
TMP = 3					/* N and TMP don't overlap */
TMP1 = 4

TEXT memcpy(SB), $0
TEXT memmove(SB), $-4
_memmove:
	MOVW	R(TS), to+0(FP)		/* need to save for return value */
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)	/* to end pointer */

	CMP	R(FROM), R(TS)		/* to <= from: forward copy is safe */
	BLS	_forward

_back:
	ADD	R(N), R(FROM)		/* from end pointer */
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, R(TE), R(TMP)
	BEQ	_b4aligned

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_bunaligned

	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
_b32loop:
	CMP	R(TMP), R(TE)
	BLS	_b4tail

	MOVM.DB.W (R(FROM)), [R4-R11]	/* 8 words, descending, writeback */
	MOVM.DB.W [R4-R11], (R(TE))
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	ADD	$3, R(TS), R(TMP)
_b4loop:
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	R(TE), R(TS)
	BEQ	_return

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b1tail

_forward:
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, R(TS), R(TMP)
	BEQ	_f4aligned

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)
	BNE	_funaligned

	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
_f32loop:
	CMP	R(TMP), R(TS)
	BHS	_f4tail

	MOVM.IA.W (R(FROM)), [R4-R11]	/* 8 words, ascending, writeback */
	MOVM.IA.W [R4-R11], (R(TS))
	B	_f32loop

_f4tail:
	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
	B	_f4loop

_f1tail:				/* remaining bytes */
	CMP	R(TS), R(TE)
	BEQ	_return

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f1tail

_return:
	MOVW	to+0(FP), R0		/* return original `to` */
	RET

/*
 * Unaligned-source support.  RSHIFT/LSHIFT/OFFSET are set per the
 * source's misalignment (1, 2 or 3 bytes); OFFSET corrects R(FROM)
 * when leaving the merge loop for the byte tail.
 */
RSHIFT = 4
LSHIFT = 5
OFFSET = 6

/*
 * Backward merge loop registers: BRn are the block read registers
 * loaded by MOVM, BWn the merged words written out.  The pairs
 * deliberately overlap (BW0=BR1, ...) so only R7-R11 are consumed.
 */
BR0 = 7
BW0 = 8
BR1 = 8
BW1 = 9
BR2 = 9
BW2 = 10
BR3 = 10
BW3 = 11

_bunaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$16, R(TS), R(TMP)	/* do 16-byte chunks if possible */
	CMP	R(TMP), R(TE)
	BLS	_b1tail

	AND	$~0x03, R(FROM)		/* align source */
	MOVW	(R(FROM)), R(BR0)	/* prime first block register */

_bu16loop:
	CMP	R(TMP), R(TE)
	BLS	_bu1tail

	MOVW	R(BR0)<<R(LSHIFT), R(BW3)
	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
	ORR	R(BR3)>>R(RSHIFT), R(BW3)

	MOVW	R(BR3)<<R(LSHIFT), R(BW2)
	ORR	R(BR2)>>R(RSHIFT), R(BW2)

	MOVW	R(BR2)<<R(LSHIFT), R(BW1)
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
	ORR	R(BR0)>>R(RSHIFT), R(BW0)

	MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
	B	_bu16loop

_bu1tail:
	ADD	R(OFFSET), R(FROM)	/* back to true byte position */
	B	_b1tail

/*
 * Forward merge loop registers; same overlapping-pair trick as BRn/BWn,
 * but the carry register is FR3 (last word read) instead of BR0.
 */
FW0 = 7
FR0 = 8
FW1 = 8
FR1 = 9
FW2 = 9
FR2 = 10
FW3 = 10
FR3 = 11

_funaligned:
	CMP	$2, R(TMP)

	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$16, R(TE), R(TMP)	/* do 16-byte chunks if possible */
	CMP	R(TMP), R(TS)
	BHS	_f1tail

	AND	$~0x03, R(FROM)		/* align source */
	MOVW.P	4(R(FROM)), R(FR3)	/* prime last block register, implicit write back */

_fu16loop:
	CMP	R(TMP), R(TS)
	BHS	_fu1tail

	MOVW	R(FR3)>>R(RSHIFT), R(FW0)
	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR3)]
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
	ORR	R(FR1)<<R(LSHIFT), R(FW1)

	MOVW	R(FR1)>>R(RSHIFT), R(FW2)
	ORR	R(FR2)<<R(LSHIFT), R(FW2)

	MOVW	R(FR2)>>R(RSHIFT), R(FW3)
	ORR	R(FR3)<<R(LSHIFT), R(FW3)

	MOVM.IA.W [R(FW0)-R(FW3)], (R(TS))
	B	_fu16loop

_fu1tail:
	SUB	R(OFFSET), R(FROM)	/* back to true byte position */
	B	_f1tail