1 #include <plan9.h> 2 3 char *argv0; 4 enum 5 { 6 Bit1 = 7, 7 Bitx = 6, 8 Bit2 = 5, 9 Bit3 = 4, 10 Bit4 = 3, 11 12 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 13 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 14 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 15 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 16 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 17 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 18 19 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ 20 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ 21 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ 22 Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ 23 24 25 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 26 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 27 28 SurrogateMin = 0xD800, 29 SurrogateMax = 0xDFFF, 30 31 Bad = Runeerror 32 }; 33 34 int 35 chartorune(Rune *rune, char *str) 36 { 37 int c, c1, c2, c3; 38 long l; 39 40 /* 41 * one character sequence 42 * 00000-0007F => T1 43 */ 44 c = *(uchar*)str; 45 if(c < Tx) { 46 *rune = c; 47 return 1; 48 } 49 50 /* 51 * two character sequence 52 * 00080-007FF => T2 Tx 53 */ 54 c1 = *(uchar*)(str+1) ^ Tx; 55 if(c1 & Testx) 56 goto bad; 57 if(c < T3) { 58 if(c < T2) 59 goto bad; 60 l = ((c << Bitx) | c1) & Rune2; 61 if(l <= Rune1) 62 goto bad; 63 *rune = l; 64 return 2; 65 } 66 67 /* 68 * three character sequence 69 * 00800-0FFFF => T3 Tx Tx 70 */ 71 c2 = *(uchar*)(str+2) ^ Tx; 72 73 if(c2 & Testx) 74 goto bad; 75 if(c < T4) { 76 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 77 if(l <= Rune2) 78 goto bad; 79 if (SurrogateMin <= l && l <= SurrogateMax) 80 goto bad; 81 *rune = l; 82 return 3; 83 } 84 85 /* 86 * four character sequence 87 * 10000-10FFFF => T4 Tx Tx Tx 88 */ 89 if(UTFmax >= 4) { 90 c3 = *(uchar*)(str+3) ^ Tx; 91 if(c3 & Testx) 92 goto bad; 93 if(c < T5) { 94 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 95 if(l <= Rune3) 96 goto bad; 97 if(l > Runemax) 98 goto bad; 99 *rune = l; 100 return 4; 101 } 102 } 103 104 /* 105 * bad decoding 106 */ 107 bad: 108 *rune = Bad; 109 return 1; 110 } 111 112 int 113 runetochar(char *str, Rune *rune) 114 { 115 long c; 116 117 /* 118 * one character sequence 119 * 00000-0007F => 00-7F 120 */ 121 c = *rune; 122 if(c <= Rune1) { 123 str[0] = c; 124 return 1; 125 } 126 127 /* 128 * two character sequence 129 * 0080-07FF => T2 Tx 130 */ 131 if(c <= Rune2) { 132 str[0] = T2 | (c >> 1*Bitx); 133 str[1] = Tx | (c & Maskx); 134 return 2; 135 } 136 /* 137 * If the Rune is out of range or a surrogate half, convert it to the error rune. 138 * Do this test here because the error rune encodes to three bytes. 139 * Doing it earlier would duplicate work, since an out of range 140 * Rune wouldn't have fit in one or two bytes. 141 */ 142 if (c > Runemax) 143 c = Runeerror; 144 if (SurrogateMin <= c && c <= SurrogateMax) 145 c = Runeerror; 146 147 /* 148 * three character sequence 149 * 0800-FFFF => T3 Tx Tx 150 */ 151 if (c <= Rune3) { 152 str[0] = T3 | (c >> 2*Bitx); 153 str[1] = Tx | ((c >> 1*Bitx) & Maskx); 154 str[2] = Tx | (c & Maskx); 155 return 3; 156 } 157 158 /* 159 * four character sequence (21-bit value) 160 * 10000-1FFFFF => T4 Tx Tx Tx 161 */ 162 str[0] = T4 | (c >> 3*Bitx); 163 str[1] = Tx | ((c >> 2*Bitx) & Maskx); 164 str[2] = Tx | ((c >> 1*Bitx) & Maskx); 165 str[3] = Tx | (c & Maskx); 166 return 4; 167 } 168 169 int 170 runelen(long c) 171 { 172 Rune rune; 173 char str[10]; 174 175 rune = c; 176 return runetochar(str, &rune); 177 } 178 179 int 180 utflen(char *s) 181 { 182 int c; 183 long n; 184 Rune rune; 185 186 n = 0; 187 for(;;) { 188 c = *(uchar*)s; 189 if(c < Runeself) { 190 if(c == 0) 191 return n; 192 s++; 193 } else 194 s += chartorune(&rune, s); 195 n++; 196 } 197 return 0; 198 } 199