1 #include <u.h> 2 #include <libc.h> 3 4 #define Bit(i) (7-(i)) 5 /* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */ 6 #define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) 7 /* 0000 0000 0000 0111 1111 1111 */ 8 #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) 9 10 enum 11 { 12 Bitx = Bit(1), 13 14 Tx = T(1), /* 1000 0000 */ 15 Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ 16 17 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 18 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 19 20 SurrogateMin = 0xD800, 21 SurrogateMax = 0xDFFF, 22 23 Bad = Runeerror, 24 }; 25 26 int 27 chartorune(Rune *rune, char *str) 28 { 29 int c[UTFmax], i; 30 Rune l; 31 32 /* 33 * N character sequence 34 * 00000-0007F => T1 35 * 00080-007FF => T2 Tx 36 * 00800-0FFFF => T3 Tx Tx 37 * 10000-10FFFF => T4 Tx Tx Tx 38 */ 39 40 c[0] = *(uchar*)(str); 41 if(c[0] < Tx){ 42 *rune = c[0]; 43 return 1; 44 } 45 l = c[0]; 46 47 for(i = 1; i < UTFmax; i++) { 48 c[i] = *(uchar*)(str+i); 49 c[i] ^= Tx; 50 if(c[i] & Testx) 51 goto bad; 52 l = (l << Bitx) | c[i]; 53 if(c[0] < T(i + 2)) { 54 l &= RuneX(i + 1); 55 if(i == 1) { 56 if(c[0] < T(2) || l <= Rune1) 57 goto bad; 58 } else if(l <= RuneX(i) || l > Runemax) 59 goto bad; 60 if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) 61 goto bad; 62 *rune = l; 63 return i + 1; 64 } 65 } 66 67 /* 68 * bad decoding 69 */ 70 bad: 71 *rune = Bad; 72 return 1; 73 } 74 75 int 76 runetochar(char *str, Rune *rune) 77 { 78 int i, j; 79 Rune c; 80 81 c = *rune; 82 if(c <= Rune1) { 83 str[0] = c; 84 return 1; 85 } 86 87 /* 88 * one character sequence 89 * 00000-0007F => 00-7F 90 * two character sequence 91 * 0080-07FF => T2 Tx 92 * three character sequence 93 * 0800-FFFF => T3 Tx Tx 94 * four character sequence (21-bit value) 95 * 10000-1FFFFF => T4 Tx Tx Tx 96 * If the Rune is out of range or a surrogate half, 97 * convert it to the error rune. 98 * Do this test when i==3 because the error rune encodes to three bytes. 99 * Doing it earlier would duplicate work, since an out of range 100 * Rune wouldn't have fit in one or two bytes. 101 */ 102 for(i = 2; i < UTFmax + 1; i++){ 103 if(i == 3){ 104 if(c > Runemax) 105 c = Runeerror; 106 if(SurrogateMin <= c && c <= SurrogateMax) 107 c = Runeerror; 108 } 109 if (c <= RuneX(i) || i == UTFmax ) { 110 str[0] = T(i) | (c >> (i - 1)*Bitx); 111 for(j = 1; j < i; j++) 112 str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); 113 return i; 114 } 115 } 116 return UTFmax; 117 } 118 119 int 120 runelen(long c) 121 { 122 Rune rune; 123 char str[10]; 124 125 rune = c; 126 return runetochar(str, &rune); 127 } 128 129 int 130 runenlen(Rune *r, int nrune) 131 { 132 int nb, i; 133 Rune c; 134 135 nb = 0; 136 while(nrune--) { 137 c = *r++; 138 if(c <= Rune1){ 139 nb++; 140 } else { 141 for(i = 2; i < UTFmax + 1; i++) 142 if(c <= RuneX(i) || i == UTFmax){ 143 nb += i; 144 break; 145 } 146 } 147 } 148 return nb; 149 } 150 151 int 152 fullrune(char *str, int n) 153 { 154 int i; 155 Rune c; 156 157 if(n <= 0) 158 return 0; 159 c = *(uchar*)str; 160 if(c < Tx) 161 return 1; 162 for(i = 3; i < UTFmax + 1; i++) 163 if(c < T(i)) 164 return n >= i - 1; 165 return n >= UTFmax; 166 } 167