1 #include "lib9.h"
2
/*
 * Bit(i): number of payload bits left in a byte whose top i bits are
 * 1s followed by a single 0 (Bit(0) is 7, Bit(1) is 6, ...).
 */
#define Bit(i)		(7 - (i))
/*
 * T(i): a byte with its top i bits set and the remaining bits clear;
 * T(1) is 1000 0000, T(2) is 1100 0000, T(3) is 1110 0000.
 */
#define T(i)		(0xFF ^ ((1 << (Bit(i) + 1)) - 1))
/*
 * RuneX(i): largest value that fits in an i-byte sequence, i.e. the
 * payload of the lead byte plus (i-1) continuation bytes of Bitx bits;
 * RuneX(2) is 0000 0000 0000 0111 1111 1111.
 */
#define RuneX(i)	((1 << (Bit(i) + Bitx * ((i) - 1))) - 1)
8
9 enum
10 {
11 Bitx = Bit(1),
12
13 Tx = T(1), /* 1000 0000 */
14 Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
15
16 Maskx = (1<<Bitx)-1, /* 0011 1111 */
17 Testx = Maskx ^ 0xFF, /* 1100 0000 */
18
19 SurrogateMin = 0xD800,
20 SurrogateMax = 0xDFFF,
21
22 Bad = Runeerror,
23 };
24
25 int
chartorune(Rune * rune,char * str)26 chartorune(Rune *rune, char *str)
27 {
28 int c[UTFmax], i;
29 Rune l;
30
31 /*
32 * N character sequence
33 * 00000-0007F => T1
34 * 00080-007FF => T2 Tx
35 * 00800-0FFFF => T3 Tx Tx
36 * 10000-10FFFF => T4 Tx Tx Tx
37 */
38
39 c[0] = *(uchar*)(str);
40 if(c[0] < Tx){
41 *rune = c[0];
42 return 1;
43 }
44 l = c[0];
45
46 for(i = 1; i < UTFmax; i++) {
47 c[i] = *(uchar*)(str+i);
48 c[i] ^= Tx;
49 if(c[i] & Testx)
50 goto bad;
51 l = (l << Bitx) | c[i];
52 if(c[0] < T(i + 2)) {
53 l &= RuneX(i + 1);
54 if(i == 1) {
55 if(c[0] < T(2) || l <= Rune1)
56 goto bad;
57 } else if(l <= RuneX(i) || l > Runemax)
58 goto bad;
59 if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
60 goto bad;
61 *rune = l;
62 return i + 1;
63 }
64 }
65
66 /*
67 * bad decoding
68 */
69 bad:
70 *rune = Bad;
71 return 1;
72 }
73
74 int
runetochar(char * str,Rune * rune)75 runetochar(char *str, Rune *rune)
76 {
77 int i, j;
78 Rune c;
79
80 c = *rune;
81 if(c <= Rune1) {
82 str[0] = c;
83 return 1;
84 }
85
86 /*
87 * one character sequence
88 * 00000-0007F => 00-7F
89 * two character sequence
90 * 0080-07FF => T2 Tx
91 * three character sequence
92 * 0800-FFFF => T3 Tx Tx
93 * four character sequence (21-bit value)
94 * 10000-1FFFFF => T4 Tx Tx Tx
95 * If the Rune is out of range or a surrogate half,
96 * convert it to the error rune.
97 * Do this test when i==3 because the error rune encodes to three bytes.
98 * Doing it earlier would duplicate work, since an out of range
99 * Rune wouldn't have fit in one or two bytes.
100 */
101 for(i = 2; i < UTFmax + 1; i++){
102 if(i == 3){
103 if(c > Runemax)
104 c = Runeerror;
105 if(SurrogateMin <= c && c <= SurrogateMax)
106 c = Runeerror;
107 }
108 if (c <= RuneX(i) || i == UTFmax ) {
109 str[0] = T(i) | (c >> (i - 1)*Bitx);
110 for(j = 1; j < i; j++)
111 str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
112 return i;
113 }
114 }
115 return UTFmax;
116 }
117
118 int
runelen(long c)119 runelen(long c)
120 {
121 Rune rune;
122 char str[10];
123
124 rune = c;
125 return runetochar(str, &rune);
126 }
127
128 int
runenlen(Rune * r,int nrune)129 runenlen(Rune *r, int nrune)
130 {
131 int nb, i;
132 Rune c;
133
134 nb = 0;
135 while(nrune--) {
136 c = *r++;
137 if(c <= Rune1){
138 nb++;
139 } else {
140 for(i = 2; i < UTFmax + 1; i++)
141 if(c <= RuneX(i) || i == UTFmax){
142 nb += i;
143 break;
144 }
145 }
146 }
147 return nb;
148 }
149
150 int
fullrune(char * str,int n)151 fullrune(char *str, int n)
152 {
153 int i;
154 Rune c;
155
156 if(n <= 0)
157 return 0;
158 c = *(uchar*)str;
159 if(c < Tx)
160 return 1;
161 for(i = 3; i < UTFmax + 1; i++)
162 if(c < T(i))
163 return n >= i - 1;
164 return n >= UTFmax;
165 }
166