/*
 * memset(p, c, n) for PowerPC, Plan 9 assembler syntax.
 *
 * Stores the low byte of c into n consecutive bytes starting at p.
 * The pointer is saved on entry and reloaded into R3 just before
 * RETURN.  A count that is <= 0 under a signed compare stores nothing.
 *
 * Register roles:
 *	R3	current store address (p on entry)
 *	R4	bytes still to store
 *	R5-R8	the fill byte replicated into every byte (16 copies in all)
 *	R9	scratch: head byte count, then 16-byte block count
 *	R10	scratch address inside the block loop
 *	XER	byte count consumed by STSW
 *	CTR	block-loop counter
 *
 * performance:
 *	about 100mbytes/sec (8k blocks) on a 603/105 without L2 cache
 *	drops to 40mbytes/sec (10k blocks) and 28mbytes/sec with 32k blocks
 */

/* BC with BO=16, BI=0: decrement CTR and branch while it is non-zero */
#define	BDNZ	BC	16,0,

	TEXT	memset(SB), $0
	MOVW	R3, p+0(FP)		/* save p; it is reloaded at done */

	MOVW	n+8(FP), R4		/* R4 = n */
	CMP	R4, $0
	BLE	done			/* nothing to store */
	MOVW	c+4(FP), R5		/* R5 = c */

/*
 * replicate the fill byte through R5, then copy R5 into R6 .. R8
 */
	RLWNM	$0, R5, $0xff, R5		/* keep only the low byte */
	RLWMI	$8, R5, $0xff00, R5		/* low halfword = cc */
	RLWMI	$16, R5, $0xffff0000, R5	/* whole word = cccc */
	MOVW	R5, R6
	MOVW	R5, R7
	MOVW	R5, R8

/*
 * 16 bytes or fewer: a single STSW handles it, whatever the alignment
 */
	CMP	R4, $16
	BLE	tail

/*
 * head: if p is not on an 8-byte boundary, store 8 - (p & 7) bytes
 * with STSW to get there, then account for them
 */
	ANDCC	$7, R3, R9
	BEQ	aligned
	SUBC	R9, $8, R9		/* R9 = 8 - (p & 7) */
	MOVW	R9, XER
	STSW	R5, (R3)
	ADD	R9, R3
	SUB	R9, R4

/*
 * body: one 16-byte block per iteration, n>>4 iterations
 * STSW was used here originally, but it's `completion serialised'
 */
aligned:
	SRAWCC	$4, R4, R9		/* R9 = number of whole blocks */
	BLE	tail			/* no whole block left */
	MOVW	R9, CTR
fill16:
	MOVW	R5, 0(R3)
	ADD	$8, R3, R10
	MOVW	R6, 4(R3)
	MOVW	R7, 0(R10)
	ADD	$8, R10, R3
	MOVW	R8, 4(R10)
	BDNZ	fill16
	RLWNMCC	$0, R4, $15, R4		/* R4 = bytes left over (n & 15) */
	BEQ	done

/*
 * tail: store the last R4 (at most 16) bytes from R5 .. R8;
 * works for aligned and unaligned addresses
 */
tail:
	MOVW	R4, XER
	STSW	R5, (R3)

done:
	MOVW	0(FP), R3		/* reload the p saved on entry */
	RETURN
	END