18ccd4a63SDavid du Colombier #include <u.h>
28ccd4a63SDavid du Colombier #include <libc.h>
38ccd4a63SDavid du Colombier
4*a1216cc6SDavid du Colombier #define Bit(i) (7-(i))
5*a1216cc6SDavid du Colombier /* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */
6*a1216cc6SDavid du Colombier #define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
7*a1216cc6SDavid du Colombier /* 0000 0000 0000 0111 1111 1111 */
8*a1216cc6SDavid du Colombier #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
9*a1216cc6SDavid du Colombier
108ccd4a63SDavid du Colombier enum
118ccd4a63SDavid du Colombier {
12*a1216cc6SDavid du Colombier Bitx = Bit(1),
138ccd4a63SDavid du Colombier
14*a1216cc6SDavid du Colombier Tx = T(1), /* 1000 0000 */
15*a1216cc6SDavid du Colombier Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
168ccd4a63SDavid du Colombier
178ccd4a63SDavid du Colombier Maskx = (1<<Bitx)-1, /* 0011 1111 */
188ccd4a63SDavid du Colombier Testx = Maskx ^ 0xFF, /* 1100 0000 */
198ccd4a63SDavid du Colombier
2082726826SDavid du Colombier SurrogateMin = 0xD800,
2182726826SDavid du Colombier SurrogateMax = 0xDFFF,
2282726826SDavid du Colombier
238ccd4a63SDavid du Colombier Bad = Runeerror,
248ccd4a63SDavid du Colombier };
258ccd4a63SDavid du Colombier
268ccd4a63SDavid du Colombier int
chartorune(Rune * rune,char * str)278ccd4a63SDavid du Colombier chartorune(Rune *rune, char *str)
288ccd4a63SDavid du Colombier {
29*a1216cc6SDavid du Colombier int c[UTFmax], i;
30*a1216cc6SDavid du Colombier Rune l;
318ccd4a63SDavid du Colombier
328ccd4a63SDavid du Colombier /*
33*a1216cc6SDavid du Colombier * N character sequence
348ccd4a63SDavid du Colombier * 00000-0007F => T1
3582726826SDavid du Colombier * 00080-007FF => T2 Tx
3682726826SDavid du Colombier * 00800-0FFFF => T3 Tx Tx
3782726826SDavid du Colombier * 10000-10FFFF => T4 Tx Tx Tx
3882726826SDavid du Colombier */
39*a1216cc6SDavid du Colombier
40*a1216cc6SDavid du Colombier c[0] = *(uchar*)(str);
41*a1216cc6SDavid du Colombier if(c[0] < Tx){
42*a1216cc6SDavid du Colombier *rune = c[0];
43*a1216cc6SDavid du Colombier return 1;
44*a1216cc6SDavid du Colombier }
45*a1216cc6SDavid du Colombier l = c[0];
46*a1216cc6SDavid du Colombier
47*a1216cc6SDavid du Colombier for(i = 1; i < UTFmax; i++) {
48*a1216cc6SDavid du Colombier c[i] = *(uchar*)(str+i);
49*a1216cc6SDavid du Colombier c[i] ^= Tx;
50*a1216cc6SDavid du Colombier if(c[i] & Testx)
5182726826SDavid du Colombier goto bad;
52*a1216cc6SDavid du Colombier l = (l << Bitx) | c[i];
53*a1216cc6SDavid du Colombier if(c[0] < T(i + 2)) {
54*a1216cc6SDavid du Colombier l &= RuneX(i + 1);
55*a1216cc6SDavid du Colombier if(i == 1) {
56*a1216cc6SDavid du Colombier if(c[0] < T(2) || l <= Rune1)
5782726826SDavid du Colombier goto bad;
58*a1216cc6SDavid du Colombier } else if(l <= RuneX(i) || l > Runemax)
59*a1216cc6SDavid du Colombier goto bad;
60*a1216cc6SDavid du Colombier if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
6182726826SDavid du Colombier goto bad;
6282726826SDavid du Colombier *rune = l;
63*a1216cc6SDavid du Colombier return i + 1;
6482726826SDavid du Colombier }
6582726826SDavid du Colombier }
6682726826SDavid du Colombier
6782726826SDavid du Colombier /*
688ccd4a63SDavid du Colombier * bad decoding
698ccd4a63SDavid du Colombier */
708ccd4a63SDavid du Colombier bad:
718ccd4a63SDavid du Colombier *rune = Bad;
728ccd4a63SDavid du Colombier return 1;
738ccd4a63SDavid du Colombier }
748ccd4a63SDavid du Colombier
758ccd4a63SDavid du Colombier int
runetochar(char * str,Rune * rune)768ccd4a63SDavid du Colombier runetochar(char *str, Rune *rune)
778ccd4a63SDavid du Colombier {
78*a1216cc6SDavid du Colombier int i, j;
79*a1216cc6SDavid du Colombier Rune c;
808ccd4a63SDavid du Colombier
818ccd4a63SDavid du Colombier c = *rune;
828ccd4a63SDavid du Colombier if(c <= Rune1) {
838ccd4a63SDavid du Colombier str[0] = c;
848ccd4a63SDavid du Colombier return 1;
858ccd4a63SDavid du Colombier }
868ccd4a63SDavid du Colombier
878ccd4a63SDavid du Colombier /*
88*a1216cc6SDavid du Colombier * one character sequence
89*a1216cc6SDavid du Colombier * 00000-0007F => 00-7F
908ccd4a63SDavid du Colombier * two character sequence
918ccd4a63SDavid du Colombier * 0080-07FF => T2 Tx
92*a1216cc6SDavid du Colombier * three character sequence
93*a1216cc6SDavid du Colombier * 0800-FFFF => T3 Tx Tx
94*a1216cc6SDavid du Colombier * four character sequence (21-bit value)
95*a1216cc6SDavid du Colombier * 10000-1FFFFF => T4 Tx Tx Tx
96*a1216cc6SDavid du Colombier * If the Rune is out of range or a surrogate half,
97*a1216cc6SDavid du Colombier * convert it to the error rune.
98*a1216cc6SDavid du Colombier * Do this test when i==3 because the error rune encodes to three bytes.
9982726826SDavid du Colombier * Doing it earlier would duplicate work, since an out of range
10082726826SDavid du Colombier * Rune wouldn't have fit in one or two bytes.
10182726826SDavid du Colombier */
102*a1216cc6SDavid du Colombier for(i = 2; i < UTFmax + 1; i++){
103*a1216cc6SDavid du Colombier if(i == 3){
10482726826SDavid du Colombier if(c > Runemax)
10582726826SDavid du Colombier c = Runeerror;
10682726826SDavid du Colombier if(SurrogateMin <= c && c <= SurrogateMax)
10782726826SDavid du Colombier c = Runeerror;
1088ccd4a63SDavid du Colombier }
109*a1216cc6SDavid du Colombier if (c <= RuneX(i) || i == UTFmax ) {
110*a1216cc6SDavid du Colombier str[0] = T(i) | (c >> (i - 1)*Bitx);
111*a1216cc6SDavid du Colombier for(j = 1; j < i; j++)
112*a1216cc6SDavid du Colombier str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
113*a1216cc6SDavid du Colombier return i;
114*a1216cc6SDavid du Colombier }
115*a1216cc6SDavid du Colombier }
116*a1216cc6SDavid du Colombier return UTFmax;
11782726826SDavid du Colombier }
11882726826SDavid du Colombier
1198ccd4a63SDavid du Colombier int
runelen(long c)1208ccd4a63SDavid du Colombier runelen(long c)
1218ccd4a63SDavid du Colombier {
1228ccd4a63SDavid du Colombier Rune rune;
1238ccd4a63SDavid du Colombier char str[10];
1248ccd4a63SDavid du Colombier
1258ccd4a63SDavid du Colombier rune = c;
1268ccd4a63SDavid du Colombier return runetochar(str, &rune);
1278ccd4a63SDavid du Colombier }
1288ccd4a63SDavid du Colombier
1298ccd4a63SDavid du Colombier int
runenlen(Rune * r,int nrune)1308ccd4a63SDavid du Colombier runenlen(Rune *r, int nrune)
1318ccd4a63SDavid du Colombier {
132*a1216cc6SDavid du Colombier int nb, i;
133*a1216cc6SDavid du Colombier Rune c;
1348ccd4a63SDavid du Colombier
1358ccd4a63SDavid du Colombier nb = 0;
1368ccd4a63SDavid du Colombier while(nrune--) {
1378ccd4a63SDavid du Colombier c = *r++;
138*a1216cc6SDavid du Colombier if(c <= Rune1){
1398ccd4a63SDavid du Colombier nb++;
140*a1216cc6SDavid du Colombier } else {
141*a1216cc6SDavid du Colombier for(i = 2; i < UTFmax + 1; i++)
142*a1216cc6SDavid du Colombier if(c <= RuneX(i) || i == UTFmax){
143*a1216cc6SDavid du Colombier nb += i;
144*a1216cc6SDavid du Colombier break;
145*a1216cc6SDavid du Colombier }
146*a1216cc6SDavid du Colombier }
1478ccd4a63SDavid du Colombier }
1488ccd4a63SDavid du Colombier return nb;
1498ccd4a63SDavid du Colombier }
1508ccd4a63SDavid du Colombier
1518ccd4a63SDavid du Colombier int
fullrune(char * str,int n)1528ccd4a63SDavid du Colombier fullrune(char *str, int n)
1538ccd4a63SDavid du Colombier {
154*a1216cc6SDavid du Colombier int i;
155*a1216cc6SDavid du Colombier Rune c;
156*a1216cc6SDavid du Colombier
15782726826SDavid du Colombier if(n <= 0)
15882726826SDavid du Colombier return 0;
1598ccd4a63SDavid du Colombier c = *(uchar*)str;
1608ccd4a63SDavid du Colombier if(c < Tx)
1618ccd4a63SDavid du Colombier return 1;
162*a1216cc6SDavid du Colombier for(i = 3; i < UTFmax + 1; i++)
163*a1216cc6SDavid du Colombier if(c < T(i))
164*a1216cc6SDavid du Colombier return n >= i - 1;
165*a1216cc6SDavid du Colombier return n >= UTFmax;
1668ccd4a63SDavid du Colombier }
167