1 #include <u.h> 2 #include <libc.h> 3 4 enum 5 { 6 Bit1 = 7, 7 Bitx = 6, 8 Bit2 = 5, 9 Bit3 = 4, 10 Bit4 = 3, 11 12 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 13 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 14 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 15 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 16 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 17 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 18 19 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ 20 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ 21 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ 22 Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ 23 24 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 25 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 26 27 SurrogateMin = 0xD800, 28 SurrogateMax = 0xDFFF, 29 30 Bad = Runeerror, 31 }; 32 33 int 34 chartorune(Rune *rune, char *str) 35 { 36 int c, c1, c2, c3; 37 long l; 38 39 /* 40 * one character sequence 41 * 00000-0007F => T1 42 */ 43 c = *(uchar*)str; 44 if(c < Tx) { 45 *rune = c; 46 return 1; 47 } 48 49 /* 50 * two character sequence 51 * 00080-007FF => T2 Tx 52 */ 53 c1 = *(uchar*)(str+1) ^ Tx; 54 if(c1 & Testx) 55 goto bad; 56 if(c < T3) { 57 if(c < T2) 58 goto bad; 59 l = ((c << Bitx) | c1) & Rune2; 60 if(l <= Rune1) 61 goto bad; 62 *rune = l; 63 return 2; 64 } 65 66 /* 67 * three character sequence 68 * 00800-0FFFF => T3 Tx Tx 69 */ 70 c2 = *(uchar*)(str+2) ^ Tx; 71 72 if(c2 & Testx) 73 goto bad; 74 if(c < T4) { 75 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 76 if(l <= Rune2) 77 goto bad; 78 if (SurrogateMin <= l && l <= SurrogateMax) 79 goto bad; 80 *rune = l; 81 return 3; 82 } 83 84 /* 85 * four character sequence 86 * 10000-10FFFF => T4 Tx Tx Tx 87 */ 88 if(UTFmax >= 4) { 89 c3 = *(uchar*)(str+3) ^ Tx; 90 if(c3 & Testx) 91 goto bad; 92 if(c < T5) { 93 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 94 if(l <= Rune3) 95 goto bad; 96 if(l > Runemax) 97 goto bad; 98 *rune = l; 99 return 4; 100 } 101 } 102 103 /* 104 * bad decoding 105 */ 106 bad: 107 *rune = Bad; 108 return 1; 109 } 110 111 int 112 runetochar(char *str, Rune *rune) 113 { 114 long c; 115 116 /* 117 * one character sequence 118 * 00000-0007F => 00-7F 119 */ 120 c = *rune; 121 if(c <= Rune1) { 122 str[0] = c; 123 return 1; 124 } 125 126 /* 127 * two character sequence 128 * 0080-07FF => T2 Tx 129 */ 130 if(c <= Rune2) { 131 str[0] = T2 | (c >> 1*Bitx); 132 str[1] = Tx | (c & Maskx); 133 return 2; 134 } 135 /* 136 * If the Rune is out of range or a surrogate half, convert it to the error rune. 137 * Do this test here because the error rune encodes to three bytes. 138 * Doing it earlier would duplicate work, since an out of range 139 * Rune wouldn't have fit in one or two bytes. 140 */ 141 if (c > Runemax) 142 c = Runeerror; 143 if (SurrogateMin <= c && c <= SurrogateMax) 144 c = Runeerror; 145 146 /* 147 * three character sequence 148 * 0800-FFFF => T3 Tx Tx 149 */ 150 if (c <= Rune3) { 151 str[0] = T3 | (c >> 2*Bitx); 152 str[1] = Tx | ((c >> 1*Bitx) & Maskx); 153 str[2] = Tx | (c & Maskx); 154 return 3; 155 } 156 157 /* 158 * four character sequence (21-bit value) 159 * 10000-1FFFFF => T4 Tx Tx Tx 160 */ 161 str[0] = T4 | (c >> 3*Bitx); 162 str[1] = Tx | ((c >> 2*Bitx) & Maskx); 163 str[2] = Tx | ((c >> 1*Bitx) & Maskx); 164 str[3] = Tx | (c & Maskx); 165 return 4; 166 } 167 168 int 169 runelen(long c) 170 { 171 Rune rune; 172 char str[10]; 173 174 rune = c; 175 return runetochar(str, &rune); 176 } 177 178 int 179 runenlen(Rune *r, int nrune) 180 { 181 int nb, c; 182 183 nb = 0; 184 while(nrune--) { 185 c = *r++; 186 if(c <= Rune1) 187 nb++; 188 else if(c <= Rune2) 189 nb += 2; 190 else if(c <= Rune3) 191 nb += 3; 192 else 193 nb += 4; 194 } 195 return nb; 196 } 197 198 int 199 fullrune(char *str, int n) 200 { 201 int c; 202 if(n <= 0) 203 return 0; 204 c = *(uchar*)str; 205 if(c < Tx) 206 return 1; 207 if(c < T3) 208 return n >= 2; 209 if(c < T4) 210 return n >= 3; 211 return n >= 4; 212 } 213