/* xref: /plan9/sys/src/cmd/unix/u9fs/rune.c (revision 82726826a7b3d40fb66339b4b0e95b60314f98b9) */
1 #include	<plan9.h>
2 
3 char *argv0;
4 enum
5 {
6 	Bit1	= 7,
7 	Bitx	= 6,
8 	Bit2	= 5,
9 	Bit3	= 4,
10 	Bit4	= 3,
11 
12 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
13 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
14 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
15 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
16 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
17 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
18 
19 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
20 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
21 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
22 	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
23 
24 
25 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
26 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
27 
28 	SurrogateMin	= 0xD800,
29 	SurrogateMax	= 0xDFFF,
30 
31 	Bad	= Runeerror
32 };
33 
34 int
chartorune(Rune * rune,char * str)35 chartorune(Rune *rune, char *str)
36 {
37 	int c, c1, c2, c3;
38 	long l;
39 
40 	/*
41 	 * one character sequence
42 	 *	00000-0007F => T1
43 	 */
44 	c = *(uchar*)str;
45 	if(c < Tx) {
46 		*rune = c;
47 		return 1;
48 	}
49 
50 	/*
51 	 * two character sequence
52 	 *	00080-007FF => T2 Tx
53 	 */
54 	c1 = *(uchar*)(str+1) ^ Tx;
55 	if(c1 & Testx)
56 		goto bad;
57 	if(c < T3) {
58 		if(c < T2)
59 			goto bad;
60 		l = ((c << Bitx) | c1) & Rune2;
61 		if(l <= Rune1)
62 			goto bad;
63 		*rune = l;
64 		return 2;
65 	}
66 
67 	/*
68 	 * three character sequence
69 	 *	00800-0FFFF => T3 Tx Tx
70 	 */
71 	c2 = *(uchar*)(str+2) ^ Tx;
72 
73 	if(c2 & Testx)
74 		goto bad;
75 	if(c < T4) {
76 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
77 		if(l <= Rune2)
78 			goto bad;
79 		if (SurrogateMin <= l && l <= SurrogateMax)
80 			goto bad;
81 		*rune = l;
82 		return 3;
83 	}
84 
85 	/*
86 	 * four character sequence
87 	 *	10000-10FFFF => T4 Tx Tx Tx
88 	 */
89 	if(UTFmax >= 4) {
90 		c3 = *(uchar*)(str+3) ^ Tx;
91 		if(c3 & Testx)
92 			goto bad;
93 		if(c < T5) {
94 			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
95 			if(l <= Rune3)
96 				goto bad;
97 			if(l > Runemax)
98 				goto bad;
99 			*rune = l;
100 			return 4;
101 		}
102 	}
103 
104 	/*
105 	 * bad decoding
106 	 */
107 bad:
108 	*rune = Bad;
109 	return 1;
110 }
111 
112 int
runetochar(char * str,Rune * rune)113 runetochar(char *str, Rune *rune)
114 {
115 	long c;
116 
117 	/*
118 	 * one character sequence
119 	 *	00000-0007F => 00-7F
120 	 */
121 	c = *rune;
122 	if(c <= Rune1) {
123 		str[0] = c;
124 		return 1;
125 	}
126 
127 	/*
128 	 * two character sequence
129 	 *	0080-07FF => T2 Tx
130 	 */
131 	if(c <= Rune2) {
132 		str[0] = T2 | (c >> 1*Bitx);
133 		str[1] = Tx | (c & Maskx);
134 		return 2;
135 	}
136 	/*
137 	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
138 	 * Do this test here because the error rune encodes to three bytes.
139 	 * Doing it earlier would duplicate work, since an out of range
140 	 * Rune wouldn't have fit in one or two bytes.
141 	 */
142 	if (c > Runemax)
143 		c = Runeerror;
144 	if (SurrogateMin <= c && c <= SurrogateMax)
145 		c = Runeerror;
146 
147 	/*
148 	 * three character sequence
149 	 *	0800-FFFF => T3 Tx Tx
150 	 */
151 	if (c <= Rune3) {
152 		str[0] = T3 |  (c >> 2*Bitx);
153 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
154 		str[2] = Tx |  (c & Maskx);
155 		return 3;
156 	}
157 
158 	/*
159 	 * four character sequence (21-bit value)
160 	 *     10000-1FFFFF => T4 Tx Tx Tx
161 	 */
162 	str[0] = T4 | (c >> 3*Bitx);
163 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
164 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
165 	str[3] = Tx | (c & Maskx);
166 	return 4;
167 }
168 
169 int
runelen(long c)170 runelen(long c)
171 {
172 	Rune rune;
173 	char str[10];
174 
175 	rune = c;
176 	return runetochar(str, &rune);
177 }
178 
179 int
utflen(char * s)180 utflen(char *s)
181 {
182 	int c;
183 	long n;
184 	Rune rune;
185 
186 	n = 0;
187 	for(;;) {
188 		c = *(uchar*)s;
189 		if(c < Runeself) {
190 			if(c == 0)
191 				return n;
192 			s++;
193 		} else
194 			s += chartorune(&rune, s);
195 		n++;
196 	}
197 	return 0;
198 }
199