#include <u.h>
#include <libc.h>

4*e94a8e9bSDavid du Colombier #define Bit(i) (7-(i))
5*e94a8e9bSDavid du Colombier /* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */
6*e94a8e9bSDavid du Colombier #define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
7*e94a8e9bSDavid du Colombier /* 0000 0000 0000 0111 1111 1111 */
8*e94a8e9bSDavid du Colombier #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
9*e94a8e9bSDavid du Colombier
103e12c5d1SDavid du Colombier enum
113e12c5d1SDavid du Colombier {
12*e94a8e9bSDavid du Colombier Bitx = Bit(1),
133e12c5d1SDavid du Colombier
14*e94a8e9bSDavid du Colombier Tx = T(1), /* 1000 0000 */
15*e94a8e9bSDavid du Colombier Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
163e12c5d1SDavid du Colombier
173e12c5d1SDavid du Colombier Maskx = (1<<Bitx)-1, /* 0011 1111 */
183e12c5d1SDavid du Colombier Testx = Maskx ^ 0xFF, /* 1100 0000 */
193e12c5d1SDavid du Colombier
20*e94a8e9bSDavid du Colombier SurrogateMin = 0xD800,
21*e94a8e9bSDavid du Colombier SurrogateMax = 0xDFFF,
22*e94a8e9bSDavid du Colombier
233e12c5d1SDavid du Colombier Bad = Runeerror,
243e12c5d1SDavid du Colombier };
253e12c5d1SDavid du Colombier
263e12c5d1SDavid du Colombier int
chartorune(Rune * rune,char * str)273e12c5d1SDavid du Colombier chartorune(Rune *rune, char *str)
283e12c5d1SDavid du Colombier {
29*e94a8e9bSDavid du Colombier int c[UTFmax], i;
30*e94a8e9bSDavid du Colombier Rune l;
313e12c5d1SDavid du Colombier
323e12c5d1SDavid du Colombier /*
33*e94a8e9bSDavid du Colombier * N character sequence
343e12c5d1SDavid du Colombier * 00000-0007F => T1
35*e94a8e9bSDavid du Colombier * 00080-007FF => T2 Tx
36*e94a8e9bSDavid du Colombier * 00800-0FFFF => T3 Tx Tx
37*e94a8e9bSDavid du Colombier * 10000-10FFFF => T4 Tx Tx Tx
383e12c5d1SDavid du Colombier */
39*e94a8e9bSDavid du Colombier
40*e94a8e9bSDavid du Colombier c[0] = *(uchar*)(str);
41*e94a8e9bSDavid du Colombier if(c[0] < Tx){
42*e94a8e9bSDavid du Colombier *rune = c[0];
433e12c5d1SDavid du Colombier return 1;
443e12c5d1SDavid du Colombier }
45*e94a8e9bSDavid du Colombier l = c[0];
463e12c5d1SDavid du Colombier
47*e94a8e9bSDavid du Colombier for(i = 1; i < UTFmax; i++) {
48*e94a8e9bSDavid du Colombier c[i] = *(uchar*)(str+i);
49*e94a8e9bSDavid du Colombier c[i] ^= Tx;
50*e94a8e9bSDavid du Colombier if(c[i] & Testx)
513e12c5d1SDavid du Colombier goto bad;
52*e94a8e9bSDavid du Colombier l = (l << Bitx) | c[i];
53*e94a8e9bSDavid du Colombier if(c[0] < T(i + 2)) {
54*e94a8e9bSDavid du Colombier l &= RuneX(i + 1);
55*e94a8e9bSDavid du Colombier if(i == 1) {
56*e94a8e9bSDavid du Colombier if(c[0] < T(2) || l <= Rune1)
573e12c5d1SDavid du Colombier goto bad;
58*e94a8e9bSDavid du Colombier } else if(l <= RuneX(i) || l > Runemax)
59*e94a8e9bSDavid du Colombier goto bad;
60*e94a8e9bSDavid du Colombier if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
613e12c5d1SDavid du Colombier goto bad;
623e12c5d1SDavid du Colombier *rune = l;
63*e94a8e9bSDavid du Colombier return i + 1;
643e12c5d1SDavid du Colombier }
653e12c5d1SDavid du Colombier }
663e12c5d1SDavid du Colombier
673e12c5d1SDavid du Colombier /*
683e12c5d1SDavid du Colombier * bad decoding
693e12c5d1SDavid du Colombier */
703e12c5d1SDavid du Colombier bad:
713e12c5d1SDavid du Colombier *rune = Bad;
723e12c5d1SDavid du Colombier return 1;
733e12c5d1SDavid du Colombier }
743e12c5d1SDavid du Colombier
753e12c5d1SDavid du Colombier int
runetochar(char * str,Rune * rune)763e12c5d1SDavid du Colombier runetochar(char *str, Rune *rune)
773e12c5d1SDavid du Colombier {
78*e94a8e9bSDavid du Colombier int i, j;
79*e94a8e9bSDavid du Colombier Rune c;
803e12c5d1SDavid du Colombier
813e12c5d1SDavid du Colombier c = *rune;
823e12c5d1SDavid du Colombier if(c <= Rune1) {
833e12c5d1SDavid du Colombier str[0] = c;
843e12c5d1SDavid du Colombier return 1;
853e12c5d1SDavid du Colombier }
863e12c5d1SDavid du Colombier
873e12c5d1SDavid du Colombier /*
88*e94a8e9bSDavid du Colombier * one character sequence
89*e94a8e9bSDavid du Colombier * 00000-0007F => 00-7F
903e12c5d1SDavid du Colombier * two character sequence
913e12c5d1SDavid du Colombier * 0080-07FF => T2 Tx
923e12c5d1SDavid du Colombier * three character sequence
933e12c5d1SDavid du Colombier * 0800-FFFF => T3 Tx Tx
94*e94a8e9bSDavid du Colombier * four character sequence (21-bit value)
95*e94a8e9bSDavid du Colombier * 10000-1FFFFF => T4 Tx Tx Tx
96*e94a8e9bSDavid du Colombier * If the Rune is out of range or a surrogate half,
97*e94a8e9bSDavid du Colombier * convert it to the error rune.
98*e94a8e9bSDavid du Colombier * Do this test when i==3 because the error rune encodes to three bytes.
99*e94a8e9bSDavid du Colombier * Doing it earlier would duplicate work, since an out of range
100*e94a8e9bSDavid du Colombier * Rune wouldn't have fit in one or two bytes.
1013e12c5d1SDavid du Colombier */
102*e94a8e9bSDavid du Colombier for(i = 2; i < UTFmax + 1; i++){
103*e94a8e9bSDavid du Colombier if(i == 3){
104*e94a8e9bSDavid du Colombier if(c > Runemax)
105*e94a8e9bSDavid du Colombier c = Runeerror;
106*e94a8e9bSDavid du Colombier if(SurrogateMin <= c && c <= SurrogateMax)
107*e94a8e9bSDavid du Colombier c = Runeerror;
108*e94a8e9bSDavid du Colombier }
109*e94a8e9bSDavid du Colombier if (c <= RuneX(i) || i == UTFmax ) {
110*e94a8e9bSDavid du Colombier str[0] = T(i) | (c >> (i - 1)*Bitx);
111*e94a8e9bSDavid du Colombier for(j = 1; j < i; j++)
112*e94a8e9bSDavid du Colombier str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
113*e94a8e9bSDavid du Colombier return i;
114*e94a8e9bSDavid du Colombier }
115*e94a8e9bSDavid du Colombier }
116*e94a8e9bSDavid du Colombier return UTFmax;
1173e12c5d1SDavid du Colombier }
1183e12c5d1SDavid du Colombier
1193e12c5d1SDavid du Colombier int
runelen(long c)1203e12c5d1SDavid du Colombier runelen(long c)
1213e12c5d1SDavid du Colombier {
1223e12c5d1SDavid du Colombier Rune rune;
1233e12c5d1SDavid du Colombier char str[10];
1243e12c5d1SDavid du Colombier
1253e12c5d1SDavid du Colombier rune = c;
1263e12c5d1SDavid du Colombier return runetochar(str, &rune);
1273e12c5d1SDavid du Colombier }
1283e12c5d1SDavid du Colombier
1293e12c5d1SDavid du Colombier int
runenlen(Rune * r,int nrune)1307dd7cddfSDavid du Colombier runenlen(Rune *r, int nrune)
1317dd7cddfSDavid du Colombier {
132*e94a8e9bSDavid du Colombier int nb, i;
133*e94a8e9bSDavid du Colombier Rune c;
1347dd7cddfSDavid du Colombier
1357dd7cddfSDavid du Colombier nb = 0;
1367dd7cddfSDavid du Colombier while(nrune--) {
1377dd7cddfSDavid du Colombier c = *r++;
138*e94a8e9bSDavid du Colombier if(c <= Rune1){
1397dd7cddfSDavid du Colombier nb++;
140*e94a8e9bSDavid du Colombier } else {
141*e94a8e9bSDavid du Colombier for(i = 2; i < UTFmax + 1; i++)
142*e94a8e9bSDavid du Colombier if(c <= RuneX(i) || i == UTFmax){
143*e94a8e9bSDavid du Colombier nb += i;
144*e94a8e9bSDavid du Colombier break;
145*e94a8e9bSDavid du Colombier }
146*e94a8e9bSDavid du Colombier }
1477dd7cddfSDavid du Colombier }
1487dd7cddfSDavid du Colombier return nb;
1497dd7cddfSDavid du Colombier }
1507dd7cddfSDavid du Colombier
1517dd7cddfSDavid du Colombier int
fullrune(char * str,int n)1523e12c5d1SDavid du Colombier fullrune(char *str, int n)
1533e12c5d1SDavid du Colombier {
154*e94a8e9bSDavid du Colombier int i;
155*e94a8e9bSDavid du Colombier Rune c;
1563e12c5d1SDavid du Colombier
157*e94a8e9bSDavid du Colombier if(n <= 0)
158*e94a8e9bSDavid du Colombier return 0;
1593e12c5d1SDavid du Colombier c = *(uchar*)str;
1603e12c5d1SDavid du Colombier if(c < Tx)
1613e12c5d1SDavid du Colombier return 1;
162*e94a8e9bSDavid du Colombier for(i = 3; i < UTFmax + 1; i++)
163*e94a8e9bSDavid du Colombier if(c < T(i))
164*e94a8e9bSDavid du Colombier return n >= i - 1;
165*e94a8e9bSDavid du Colombier return n >= UTFmax;
1663e12c5d1SDavid du Colombier }
167