/*
 * memset for ARM, Plan 9 assembler syntax.
 *
 * On entry R0 holds the destination pointer; the fill value is read from
 * data+4(FP) and the byte count from n+8(FP).  R0 is never written here,
 * so it still holds the destination pointer at RET (presumably that is
 * the value handed back to the caller).
 *
 * Counts below 4 are stored one byte at a time.  For anything larger the
 * fill byte is spread across a whole word, the pointer is walked byte by
 * byte up to a 4-byte boundary, and the rest is stored as 32-byte blocks,
 * then single words, then any leftover bytes.
 */
DST = 1			/* running store pointer */
LIM = 2			/* one past the last byte to be stored */
CNT = 3			/* byte count; dead after LIM is formed and the size test is done */
SCR = 3			/* scratch; shares R3 with CNT because their lifetimes are disjoint */
FILL = 4		/* fill value; R4 is also the first register of the block store */

TEXT memset(SB), $0
	MOVW	R0, R(DST)		/* work on a copy, R0 stays intact */
	MOVW	data+4(FP), R(FILL)
	MOVW	n+8(FP), R(CNT)

	ADD	R(CNT), R(DST), R(LIM)	/* LIM = DST + CNT */

	CMP	$4, R(CNT)		/* fewer than 4 bytes: byte stores only */
	BLT	_bytetail

	AND	$0xFF, R(FILL)		/* keep only the low byte ... */
	SLL	$8, R(FILL), R(SCR)	/* ... and copy it into all four byte lanes */
	ORR	R(SCR), R(FILL)
	SLL	$16, R(FILL), R(SCR)
	ORR	R(SCR), R(FILL)

_alignloop:				/* byte stores until DST is a multiple of 4 */
	AND.S	$3, R(DST), R(SCR)
	BEQ	_aligned

	MOVBU.P	R(FILL), 1(R(DST))	/* store a byte, DST += 1 */
	B	_alignloop

_aligned:
	SUB	$31, R(LIM), R(SCR)	/* DST < LIM-31 means a full 32 bytes remain */
	CMP	R(SCR), R(DST)
	BHS	_wordtail		/* no full block: skip the register setup */

	MOVW	R4, R5			/* fill R5-R11 with the pattern word from R4 */
	MOVW	R4, R6
	MOVW	R4, R7
	MOVW	R4, R8
	MOVW	R4, R9
	MOVW	R4, R10
	MOVW	R4, R11

_blockloop:
	CMP	R(SCR), R(DST)
	BHS	_wordtail

	MOVM.IA.W	[R4-R11], (R(DST))	/* store 8 words, DST written back */
	B	_blockloop

_wordtail:
	SUB	$3, R(LIM), R(SCR)	/* DST < LIM-3 means a full word remains */
_wordloop:
	CMP	R(SCR), R(DST)
	BHS	_bytetail

	MOVW.P	R(FILL), 4(R(DST))	/* store a word, DST += 4 */
	B	_wordloop

_bytetail:				/* finish byte by byte until DST reaches LIM */
	CMP	R(DST), R(LIM)
	BEQ	_done

	MOVBU.P	R(FILL), 1(R(DST))	/* store a byte, DST += 1 */
	B	_bytetail

_done:
	RET