xref: /inferno-os/lib9/rune.c (revision 7ded4a527bdfd0e8b3a9049955f2af89e5f039ee)
1 #include	"lib9.h"
2 
3 #define Bit(i) (7-(i))
4 /* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */
5 #define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
6 /* 0000 0000 0000 0111 1111 1111 */
7 #define	RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
8 
9 enum
10 {
11 	Bitx	= Bit(1),
12 
13 	Tx	= T(1),			/* 1000 0000 */
14 	Rune1 = (1<<(Bit(0)+0*Bitx))-1,	/* 0000 0000 0000 0000 0111 1111 */
15 
16 	Maskx	= (1<<Bitx)-1,		/* 0011 1111 */
17 	Testx	= Maskx ^ 0xFF,		/* 1100 0000 */
18 
19 	SurrogateMin	= 0xD800,
20 	SurrogateMax	= 0xDFFF,
21 
22 	Bad	= Runeerror,
23 };
24 
25 int
chartorune(Rune * rune,char * str)26 chartorune(Rune *rune, char *str)
27 {
28 	int c[UTFmax], i;
29 	Rune l;
30 
31 	/*
32 	 * N character sequence
33 	 *	00000-0007F => T1
34 	 *	00080-007FF => T2 Tx
35 	 *	00800-0FFFF => T3 Tx Tx
36 	 *	10000-10FFFF => T4 Tx Tx Tx
37 	 */
38 
39 	c[0] = *(uchar*)(str);
40 	if(c[0] < Tx){
41 		*rune = c[0];
42 		return 1;
43 	}
44 	l = c[0];
45 
46 	for(i = 1; i < UTFmax; i++) {
47 		c[i] = *(uchar*)(str+i);
48 		c[i] ^= Tx;
49 		if(c[i] & Testx)
50 			goto bad;
51 		l = (l << Bitx) | c[i];
52 		if(c[0] < T(i + 2)) {
53 			l &= RuneX(i + 1);
54 			if(i == 1) {
55 				if(c[0] < T(2) || l <= Rune1)
56 					goto bad;
57 			} else if(l <= RuneX(i) || l > Runemax)
58 				goto bad;
59 			if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
60 				goto bad;
61 			*rune = l;
62 			return i + 1;
63 		}
64 	}
65 
66 	/*
67 	 * bad decoding
68 	 */
69 bad:
70 	*rune = Bad;
71 	return 1;
72 }
73 
74 int
runetochar(char * str,Rune * rune)75 runetochar(char *str, Rune *rune)
76 {
77 	int i, j;
78 	Rune c;
79 
80 	c = *rune;
81 	if(c <= Rune1) {
82 		str[0] = c;
83 		return 1;
84 	}
85 
86 	/*
87 	 * one character sequence
88 	 *	00000-0007F => 00-7F
89 	 * two character sequence
90 	 *	0080-07FF => T2 Tx
91 	 * three character sequence
92 	 *	0800-FFFF => T3 Tx Tx
93 	 * four character sequence (21-bit value)
94 	 *     10000-1FFFFF => T4 Tx Tx Tx
95 	 * If the Rune is out of range or a surrogate half,
96 	 * convert it to the error rune.
97 	 * Do this test when i==3 because the error rune encodes to three bytes.
98 	 * Doing it earlier would duplicate work, since an out of range
99 	 * Rune wouldn't have fit in one or two bytes.
100 	 */
101 	for(i = 2; i < UTFmax + 1; i++){
102 		if(i == 3){
103 			if(c > Runemax)
104 				c = Runeerror;
105 			if(SurrogateMin <= c && c <= SurrogateMax)
106 				c = Runeerror;
107 		}
108 		if (c <= RuneX(i) || i == UTFmax ) {
109 			str[0] = T(i) |  (c >> (i - 1)*Bitx);
110 			for(j = 1; j < i; j++)
111 				str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
112 			return i;
113 		}
114 	}
115 	return UTFmax;
116 }
117 
118 int
runelen(long c)119 runelen(long c)
120 {
121 	Rune rune;
122 	char str[10];
123 
124 	rune = c;
125 	return runetochar(str, &rune);
126 }
127 
128 int
runenlen(Rune * r,int nrune)129 runenlen(Rune *r, int nrune)
130 {
131 	int nb, i;
132 	Rune c;
133 
134 	nb = 0;
135 	while(nrune--) {
136 		c = *r++;
137 		if(c <= Rune1){
138 			nb++;
139 		} else {
140 			for(i = 2; i < UTFmax + 1; i++)
141 				if(c <= RuneX(i) || i == UTFmax){
142 					nb += i;
143 					break;
144 				}
145 		}
146 	}
147 	return nb;
148 }
149 
150 int
fullrune(char * str,int n)151 fullrune(char *str, int n)
152 {
153 	int  i;
154 	Rune c;
155 
156 	if(n <= 0)
157 		return 0;
158 	c = *(uchar*)str;
159 	if(c < Tx)
160 		return 1;
161 	for(i = 3; i < UTFmax + 1; i++)
162 		if(c < T(i))
163 			return n >= i - 1;
164 	return n >= UTFmax;
165 }
166