1 #include <u.h>
2 #include <libc.h>
3
/*
 * Bit(i): payload bits left in a byte whose top i bits are ones
 * followed by a zero, e.g. Bit(1) is 6 for a 10xx xxxx byte.
 */
#define Bit(i)		(7 - (i))
/*
 * T(i): byte with its top i bits set and the rest clear;
 * T(1) is 1000 0000, T(2) is 1100 0000.
 */
#define T(i)		(0xFF ^ ((1 << (Bit(i) + 1)) - 1))
/*
 * RuneX(i): mask of the Bit(i) + (i-1)*Bitx payload bits carried by an
 * i-byte sequence, e.g. RuneX(2) is 0000 0000 0000 0111 1111 1111.
 * Meant for i >= 2; the one-byte limit is Rune1.
 */
#define RuneX(i)	((1 << (Bit(i) + ((i) - 1) * Bitx)) - 1)
9
10 enum
11 {
12 Bitx = Bit(1),
13
14 Tx = T(1), /* 1000 0000 */
15 Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
16
17 Maskx = (1<<Bitx)-1, /* 0011 1111 */
18 Testx = Maskx ^ 0xFF, /* 1100 0000 */
19
20 SurrogateMin = 0xD800,
21 SurrogateMax = 0xDFFF,
22
23 Bad = Runeerror,
24 };
25
26 int
chartorune(Rune * rune,char * str)27 chartorune(Rune *rune, char *str)
28 {
29 int c[UTFmax], i;
30 Rune l;
31
32 /*
33 * N character sequence
34 * 00000-0007F => T1
35 * 00080-007FF => T2 Tx
36 * 00800-0FFFF => T3 Tx Tx
37 * 10000-10FFFF => T4 Tx Tx Tx
38 */
39
40 c[0] = *(uchar*)(str);
41 if(c[0] < Tx){
42 *rune = c[0];
43 return 1;
44 }
45 l = c[0];
46
47 for(i = 1; i < UTFmax; i++) {
48 c[i] = *(uchar*)(str+i);
49 c[i] ^= Tx;
50 if(c[i] & Testx)
51 goto bad;
52 l = (l << Bitx) | c[i];
53 if(c[0] < T(i + 2)) {
54 l &= RuneX(i + 1);
55 if(i == 1) {
56 if(c[0] < T(2) || l <= Rune1)
57 goto bad;
58 } else if(l <= RuneX(i) || l > Runemax)
59 goto bad;
60 if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
61 goto bad;
62 *rune = l;
63 return i + 1;
64 }
65 }
66
67 /*
68 * bad decoding
69 */
70 bad:
71 *rune = Bad;
72 return 1;
73 }
74
75 int
runetochar(char * str,Rune * rune)76 runetochar(char *str, Rune *rune)
77 {
78 int i, j;
79 Rune c;
80
81 c = *rune;
82 if(c <= Rune1) {
83 str[0] = c;
84 return 1;
85 }
86
87 /*
88 * one character sequence
89 * 00000-0007F => 00-7F
90 * two character sequence
91 * 0080-07FF => T2 Tx
92 * three character sequence
93 * 0800-FFFF => T3 Tx Tx
94 * four character sequence (21-bit value)
95 * 10000-1FFFFF => T4 Tx Tx Tx
96 * If the Rune is out of range or a surrogate half,
97 * convert it to the error rune.
98 * Do this test when i==3 because the error rune encodes to three bytes.
99 * Doing it earlier would duplicate work, since an out of range
100 * Rune wouldn't have fit in one or two bytes.
101 */
102 for(i = 2; i < UTFmax + 1; i++){
103 if(i == 3){
104 if(c > Runemax)
105 c = Runeerror;
106 if(SurrogateMin <= c && c <= SurrogateMax)
107 c = Runeerror;
108 }
109 if (c <= RuneX(i) || i == UTFmax ) {
110 str[0] = T(i) | (c >> (i - 1)*Bitx);
111 for(j = 1; j < i; j++)
112 str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
113 return i;
114 }
115 }
116 return UTFmax;
117 }
118
119 int
runelen(long c)120 runelen(long c)
121 {
122 Rune rune;
123 char str[10];
124
125 rune = c;
126 return runetochar(str, &rune);
127 }
128
129 int
runenlen(Rune * r,int nrune)130 runenlen(Rune *r, int nrune)
131 {
132 int nb, i;
133 Rune c;
134
135 nb = 0;
136 while(nrune--) {
137 c = *r++;
138 if(c <= Rune1){
139 nb++;
140 } else {
141 for(i = 2; i < UTFmax + 1; i++)
142 if(c <= RuneX(i) || i == UTFmax){
143 nb += i;
144 break;
145 }
146 }
147 }
148 return nb;
149 }
150
151 int
fullrune(char * str,int n)152 fullrune(char *str, int n)
153 {
154 int i;
155 Rune c;
156
157 if(n <= 0)
158 return 0;
159 c = *(uchar*)str;
160 if(c < Tx)
161 return 1;
162 for(i = 3; i < UTFmax + 1; i++)
163 if(c < T(i))
164 return n >= i - 1;
165 return n >= UTFmax;
166 }
167