/*
 * memcpy(to, from, n) and memmove(to, from, n) for ARM,
 * written in Plan 9 assembler syntax.
 *
 * memcpy is only an alternate entry point: it branches straight to
 * the memmove body, so both names share the overlap-safe copy.
 *
 * In:	R0 = to; from and n are loaded from the argument frame.
 * Out:	R0 = the original to (saved in its argument slot on entry
 *	and reloaded at _return).
 * Uses:	R0-R8 and R11.
 *
 * Strategy:
 *  - to <= from (unsigned): copy forward from the start;
 *    otherwise copy backward from the end.
 *  - fewer than 4 bytes: go directly to the byte loop.
 *  - otherwise copy bytes until the destination is 4-byte aligned, then
 *      source also aligned:  32-byte chunks, then words, then bytes;
 *      source not aligned:   build each destination word from two
 *                            neighbouring source words with shifts,
 *                            8 bytes per iteration, then bytes.
 *
 * NOTE(review): the shift amounts in the unaligned paths look like they
 * assume little-endian byte order within a word -- confirm for the target.
 */

/* register assignments, used as R(name) */
TS = 0					/* destination start; advances on forward copies */
TE = 1					/* destination end; retreats on backward copies */
FROM = 2				/* source pointer */
N = 3					/* byte count */
TMP = 3					/* N and TMP don't overlap */
TMP1 = 4

TEXT memcpy(SB), $0
	B	_memmove
TEXT memmove(SB), $0
_memmove:
	MOVW	R(TS), to+0(FP)		/* need to save for return value */
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)	/* to end pointer */

	CMP	R(FROM), R(TS)
	BLS	_forward		/* to <= from: a forward copy is safe */

/*
 * Backward copy: both pointers start one past the end and are
 * pre-decremented by every transfer.
 */
_back:
	ADD	R(N), R(FROM)		/* from end pointer */
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, R(TE), R(TMP)	/* N is dead from here on; R3 is reused as TMP */
	BEQ	_b4aligned

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)	/* TMP = source misalignment, read by _bunaligned */
	BNE	_bunaligned

	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
_b32loop:
	CMP	R(TMP), R(TE)
	BLS	_b4tail			/* fewer than 32 bytes left */

	/* two 16-byte multi-register transfers per iteration */
	MOVM.DB.W (R(FROM)), [R4-R7]
	MOVM.DB.W [R4-R7], (R(TE))
	MOVM.DB.W (R(FROM)), [R4-R7]
	MOVM.DB.W [R4-R7], (R(TE))
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	ADD	$3, R(TS), R(TMP)
_b4loop:
	CMP	R(TMP), R(TE)
	BLS	_b1tail			/* fewer than 4 bytes left */

	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	R(TE), R(TS)
	BEQ	_return			/* end pointer has reached the start: done */

	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
	B	_b1tail

/*
 * Forward copy: TS and FROM are post-incremented by every transfer;
 * TE stays fixed and marks the end of the destination.
 */
_forward:
	CMP	$4, R(N)		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, R(TS), R(TMP)	/* N is dead from here on; R3 is reused as TMP */
	BEQ	_f4aligned

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, R(FROM), R(TMP)	/* TMP = source misalignment, read by _funaligned */
	BNE	_funaligned

	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
_f32loop:
	CMP	R(TMP), R(TS)
	BHS	_f4tail			/* fewer than 32 bytes left */

	/* two 16-byte multi-register transfers per iteration */
	MOVM.IA.W (R(FROM)), [R4-R7]
	MOVM.IA.W [R4-R7], (R(TS))
	MOVM.IA.W (R(FROM)), [R4-R7]
	MOVM.IA.W [R4-R7], (R(TS))
	B	_f32loop

_f4tail:
	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
_f4loop:
	CMP	R(TMP), R(TS)
	BHS	_f1tail			/* fewer than 4 bytes left */

	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
	B	_f4loop

_f1tail:				/* remaining bytes */
	CMP	R(TS), R(TE)
	BEQ	_return			/* start pointer has reached the end: done */

	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
	B	_f1tail

_return:
	MOVW	to+0(FP), R0		/* return the destination saved on entry */
	RET

/*
 * Unaligned-source paths.  The destination is 4-byte aligned here and
 * TMP holds the source misalignment (1, 2 or 3).  The source pointer is
 * rounded down to a word boundary and every destination word is formed
 * from two neighbouring source words:
 *	RSHIFT	right-shift count, 8 * misalignment
 *	LSHIFT	left-shift count, 32 - RSHIFT
 *	OFFSET	byte correction applied to FROM before the byte tail,
 *		undoing the rounding (and, going forward, the priming load)
 * These three registers are shared by both directions.
 */
RSHIFT = 4
LSHIFT = 5
OFFSET = 11

/*
 * Backward block registers.  BW0 and BR1 are the same register: BR1 is
 * last read by the very instruction that writes BW0.
 */
BR0 = 6
BW0 = 7
BR1 = 7
BW1 = 8

_bunaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$8, R(TS), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TE)
	BLS	_b1tail			/* 8 bytes or fewer left; FROM is still exact */

	BIC	$3, R(FROM)		/* align source */
	MOVW	(R(FROM)), R(BR0)	/* prime first block register */

_bu8loop:
	CMP	R(TMP), R(TE)
	BLS	_bu1tail

	MOVW	R(BR0)<<R(LSHIFT), R(BW1)	/* high part from the word already held */
	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR1)]	/* fetch the two preceding source words */
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)	/* overwrites BR1 in place */
	ORR	R(BR0)>>R(RSHIFT), R(BW0)	/* BR0 is kept for the next iteration */

	MOVM.DB.W [R(BW0)-R(BW1)], (R(TE))
	B	_bu8loop

_bu1tail:
	ADD	R(OFFSET), R(FROM)	/* back to the exact source byte position */
	B	_b1tail

/*
 * Forward block registers.  FR0 and FW1 are the same register: FR0 is
 * last read by the very instruction that writes FW1.
 */
FW0 = 6
FR0 = 7
FW1 = 7
FR1 = 8

_funaligned:
	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */

	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$8, R(TE), R(TMP)	/* do 8-byte chunks if possible */
	CMP	R(TMP), R(TS)
	BHS	_f1tail			/* 8 bytes or fewer left; FROM is still exact */

	BIC	$3, R(FROM)		/* align source */
	MOVW.P	4(R(FROM)), R(FR1)	/* prime last block register, implicit write back */

_fu8loop:
	CMP	R(TMP), R(TS)
	BHS	_fu1tail

	MOVW	R(FR1)>>R(RSHIFT), R(FW0)	/* low part from the word already held */
	MOVM.IA.W (R(FROM)), [R(FR0)-R(FR1)]	/* fetch the two following source words */
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)	/* overwrites FR0 in place */
	ORR	R(FR1)<<R(LSHIFT), R(FW1)	/* FR1 is kept for the next iteration */

	MOVM.IA.W [R(FW0)-R(FW1)], (R(TS))
	B	_fu8loop

_fu1tail:
	SUB	R(OFFSET), R(FROM)	/* back to the exact source byte position */
	B	_f1tail