1 #include "lib9.h" 2 3 #define Bit(i) (7-(i)) 4 /* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */ 5 #define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) 6 /* 0000 0000 0000 0111 1111 1111 */ 7 #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) 8 9 enum 10 { 11 Bitx = Bit(1), 12 13 Tx = T(1), /* 1000 0000 */ 14 Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ 15 16 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 17 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 18 19 SurrogateMin = 0xD800, 20 SurrogateMax = 0xDFFF, 21 22 Bad = Runeerror, 23 }; 24 25 int 26 chartorune(Rune *rune, char *str) 27 { 28 int c[UTFmax], i; 29 Rune l; 30 31 /* 32 * N character sequence 33 * 00000-0007F => T1 34 * 00080-007FF => T2 Tx 35 * 00800-0FFFF => T3 Tx Tx 36 * 10000-10FFFF => T4 Tx Tx Tx 37 */ 38 39 c[0] = *(uchar*)(str); 40 if(c[0] < Tx){ 41 *rune = c[0]; 42 return 1; 43 } 44 l = c[0]; 45 46 for(i = 1; i < UTFmax; i++) { 47 c[i] = *(uchar*)(str+i); 48 c[i] ^= Tx; 49 if(c[i] & Testx) 50 goto bad; 51 l = (l << Bitx) | c[i]; 52 if(c[0] < T(i + 2)) { 53 l &= RuneX(i + 1); 54 if(i == 1) { 55 if(c[0] < T(2) || l <= Rune1) 56 goto bad; 57 } else if(l <= RuneX(i) || l > Runemax) 58 goto bad; 59 if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) 60 goto bad; 61 *rune = l; 62 return i + 1; 63 } 64 } 65 66 /* 67 * bad decoding 68 */ 69 bad: 70 *rune = Bad; 71 return 1; 72 } 73 74 int 75 runetochar(char *str, Rune *rune) 76 { 77 int i, j; 78 Rune c; 79 80 c = *rune; 81 if(c <= Rune1) { 82 str[0] = c; 83 return 1; 84 } 85 86 /* 87 * one character sequence 88 * 00000-0007F => 00-7F 89 * two character sequence 90 * 0080-07FF => T2 Tx 91 * three character sequence 92 * 0800-FFFF => T3 Tx Tx 93 * four character sequence (21-bit value) 94 * 10000-1FFFFF => T4 Tx Tx Tx 95 * If the Rune is out of range or a surrogate half, 96 * convert it to the error rune. 97 * Do this test when i==3 because the error rune encodes to three bytes. 98 * Doing it earlier would duplicate work, since an out of range 99 * Rune wouldn't have fit in one or two bytes. 100 */ 101 for(i = 2; i < UTFmax + 1; i++){ 102 if(i == 3){ 103 if(c > Runemax) 104 c = Runeerror; 105 if(SurrogateMin <= c && c <= SurrogateMax) 106 c = Runeerror; 107 } 108 if (c <= RuneX(i) || i == UTFmax ) { 109 str[0] = T(i) | (c >> (i - 1)*Bitx); 110 for(j = 1; j < i; j++) 111 str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); 112 return i; 113 } 114 } 115 return UTFmax; 116 } 117 118 int 119 runelen(long c) 120 { 121 Rune rune; 122 char str[10]; 123 124 rune = c; 125 return runetochar(str, &rune); 126 } 127 128 int 129 runenlen(Rune *r, int nrune) 130 { 131 int nb, i; 132 Rune c; 133 134 nb = 0; 135 while(nrune--) { 136 c = *r++; 137 if(c <= Rune1){ 138 nb++; 139 } else { 140 for(i = 2; i < UTFmax + 1; i++) 141 if(c <= RuneX(i) || i == UTFmax){ 142 nb += i; 143 break; 144 } 145 } 146 } 147 return nb; 148 } 149 150 int 151 fullrune(char *str, int n) 152 { 153 int i; 154 Rune c; 155 156 if(n <= 0) 157 return 0; 158 c = *(uchar*)str; 159 if(c < Tx) 160 return 1; 161 for(i = 3; i < UTFmax + 1; i++) 162 if(c < T(i)) 163 return n >= i - 1; 164 return n >= UTFmax; 165 } 166