/* xref: /inferno-os/libkern/rune.c (revision 3cd4f1d15146c08f05206d6328ecbc1c7fdc8dfa) */
137da2899SCharles.Forsyth #include	"lib9.h"
237da2899SCharles.Forsyth 
3*3cd4f1d1SCharles Forsyth #define Bit(i) (7-(i))
4*3cd4f1d1SCharles Forsyth /* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */
5*3cd4f1d1SCharles Forsyth #define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
6*3cd4f1d1SCharles Forsyth /* 0000 0000 0000 0111 1111 1111 */
7*3cd4f1d1SCharles Forsyth #define	RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
8*3cd4f1d1SCharles Forsyth 
937da2899SCharles.Forsyth enum
1037da2899SCharles.Forsyth {
11*3cd4f1d1SCharles Forsyth 	Bitx	= Bit(1),
1237da2899SCharles.Forsyth 
13*3cd4f1d1SCharles Forsyth 	Tx	= T(1),			/* 1000 0000 */
14*3cd4f1d1SCharles Forsyth 	Rune1 = (1<<(Bit(0)+0*Bitx))-1,	/* 0000 0000 0000 0000 0111 1111 */
1537da2899SCharles.Forsyth 
1637da2899SCharles.Forsyth 	Maskx	= (1<<Bitx)-1,		/* 0011 1111 */
1737da2899SCharles.Forsyth 	Testx	= Maskx ^ 0xFF,		/* 1100 0000 */
1837da2899SCharles.Forsyth 
19*3cd4f1d1SCharles Forsyth 	SurrogateMin	= 0xD800,
20*3cd4f1d1SCharles Forsyth 	SurrogateMax	= 0xDFFF,
21*3cd4f1d1SCharles Forsyth 
2237da2899SCharles.Forsyth 	Bad	= Runeerror,
2337da2899SCharles.Forsyth };
2437da2899SCharles.Forsyth 
2537da2899SCharles.Forsyth int
chartorune(Rune * rune,char * str)2637da2899SCharles.Forsyth chartorune(Rune *rune, char *str)
2737da2899SCharles.Forsyth {
28*3cd4f1d1SCharles Forsyth 	int c[UTFmax], i;
29*3cd4f1d1SCharles Forsyth 	Rune l;
3037da2899SCharles.Forsyth 
3137da2899SCharles.Forsyth 	/*
32*3cd4f1d1SCharles Forsyth 	 * N character sequence
3337da2899SCharles.Forsyth 	 *	00000-0007F => T1
34*3cd4f1d1SCharles Forsyth 	 *	00080-007FF => T2 Tx
35*3cd4f1d1SCharles Forsyth 	 *	00800-0FFFF => T3 Tx Tx
36*3cd4f1d1SCharles Forsyth 	 *	10000-10FFFF => T4 Tx Tx Tx
3737da2899SCharles.Forsyth 	 */
38*3cd4f1d1SCharles Forsyth 
39*3cd4f1d1SCharles Forsyth 	c[0] = *(uchar*)(str);
40*3cd4f1d1SCharles Forsyth 	if(c[0] < Tx){
41*3cd4f1d1SCharles Forsyth 		*rune = c[0];
4237da2899SCharles.Forsyth 		return 1;
4337da2899SCharles.Forsyth 	}
44*3cd4f1d1SCharles Forsyth 	l = c[0];
4537da2899SCharles.Forsyth 
46*3cd4f1d1SCharles Forsyth 	for(i = 1; i < UTFmax; i++) {
47*3cd4f1d1SCharles Forsyth 		c[i] = *(uchar*)(str+i);
48*3cd4f1d1SCharles Forsyth 		c[i] ^= Tx;
49*3cd4f1d1SCharles Forsyth 		if(c[i] & Testx)
5037da2899SCharles.Forsyth 			goto bad;
51*3cd4f1d1SCharles Forsyth 		l = (l << Bitx) | c[i];
52*3cd4f1d1SCharles Forsyth 		if(c[0] < T(i + 2)) {
53*3cd4f1d1SCharles Forsyth 			l &= RuneX(i + 1);
54*3cd4f1d1SCharles Forsyth 			if(i == 1) {
55*3cd4f1d1SCharles Forsyth 				if(c[0] < T(2) || l <= Rune1)
5637da2899SCharles.Forsyth 					goto bad;
57*3cd4f1d1SCharles Forsyth 			} else if(l <= RuneX(i) || l > Runemax)
58*3cd4f1d1SCharles Forsyth 				goto bad;
59*3cd4f1d1SCharles Forsyth 			if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
6037da2899SCharles.Forsyth 				goto bad;
6137da2899SCharles.Forsyth 			*rune = l;
62*3cd4f1d1SCharles Forsyth 			return i + 1;
6337da2899SCharles.Forsyth 		}
6437da2899SCharles.Forsyth 	}
6537da2899SCharles.Forsyth 
6637da2899SCharles.Forsyth 	/*
6737da2899SCharles.Forsyth 	 * bad decoding
6837da2899SCharles.Forsyth 	 */
6937da2899SCharles.Forsyth bad:
7037da2899SCharles.Forsyth 	*rune = Bad;
7137da2899SCharles.Forsyth 	return 1;
7237da2899SCharles.Forsyth }
7337da2899SCharles.Forsyth 
7437da2899SCharles.Forsyth int
runetochar(char * str,Rune * rune)7537da2899SCharles.Forsyth runetochar(char *str, Rune *rune)
7637da2899SCharles.Forsyth {
77*3cd4f1d1SCharles Forsyth 	int i, j;
78*3cd4f1d1SCharles Forsyth 	Rune c;
7937da2899SCharles.Forsyth 
8037da2899SCharles.Forsyth 	c = *rune;
8137da2899SCharles.Forsyth 	if(c <= Rune1) {
8237da2899SCharles.Forsyth 		str[0] = c;
8337da2899SCharles.Forsyth 		return 1;
8437da2899SCharles.Forsyth 	}
8537da2899SCharles.Forsyth 
8637da2899SCharles.Forsyth 	/*
87*3cd4f1d1SCharles Forsyth 	 * one character sequence
88*3cd4f1d1SCharles Forsyth 	 *	00000-0007F => 00-7F
8937da2899SCharles.Forsyth 	 * two character sequence
9037da2899SCharles.Forsyth 	 *	0080-07FF => T2 Tx
9137da2899SCharles.Forsyth 	 * three character sequence
9237da2899SCharles.Forsyth 	 *	0800-FFFF => T3 Tx Tx
93*3cd4f1d1SCharles Forsyth 	 * four character sequence (21-bit value)
94*3cd4f1d1SCharles Forsyth 	 *     10000-1FFFFF => T4 Tx Tx Tx
95*3cd4f1d1SCharles Forsyth 	 * If the Rune is out of range or a surrogate half,
96*3cd4f1d1SCharles Forsyth 	 * convert it to the error rune.
97*3cd4f1d1SCharles Forsyth 	 * Do this test when i==3 because the error rune encodes to three bytes.
98*3cd4f1d1SCharles Forsyth 	 * Doing it earlier would duplicate work, since an out of range
99*3cd4f1d1SCharles Forsyth 	 * Rune wouldn't have fit in one or two bytes.
10037da2899SCharles.Forsyth 	 */
101*3cd4f1d1SCharles Forsyth 	for(i = 2; i < UTFmax + 1; i++){
102*3cd4f1d1SCharles Forsyth 		if(i == 3){
103*3cd4f1d1SCharles Forsyth 			if(c > Runemax)
104*3cd4f1d1SCharles Forsyth 				c = Runeerror;
105*3cd4f1d1SCharles Forsyth 			if(SurrogateMin <= c && c <= SurrogateMax)
106*3cd4f1d1SCharles Forsyth 				c = Runeerror;
107*3cd4f1d1SCharles Forsyth 		}
108*3cd4f1d1SCharles Forsyth 		if (c <= RuneX(i) || i == UTFmax ) {
109*3cd4f1d1SCharles Forsyth 			str[0] = T(i) |  (c >> (i - 1)*Bitx);
110*3cd4f1d1SCharles Forsyth 			for(j = 1; j < i; j++)
111*3cd4f1d1SCharles Forsyth 				str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
112*3cd4f1d1SCharles Forsyth 			return i;
113*3cd4f1d1SCharles Forsyth 		}
114*3cd4f1d1SCharles Forsyth 	}
115*3cd4f1d1SCharles Forsyth 	return UTFmax;
11637da2899SCharles.Forsyth }
11737da2899SCharles.Forsyth 
11837da2899SCharles.Forsyth int
runelen(long c)11937da2899SCharles.Forsyth runelen(long c)
12037da2899SCharles.Forsyth {
121*3cd4f1d1SCharles Forsyth 	Rune rune;
122*3cd4f1d1SCharles Forsyth 	char str[10];
123*3cd4f1d1SCharles Forsyth 
124*3cd4f1d1SCharles Forsyth 	rune = c;
125*3cd4f1d1SCharles Forsyth 	return runetochar(str, &rune);
12637da2899SCharles.Forsyth }
12737da2899SCharles.Forsyth 
12837da2899SCharles.Forsyth int
runenlen(Rune * r,int nrune)129*3cd4f1d1SCharles Forsyth runenlen(Rune *r, int nrune)
13037da2899SCharles.Forsyth {
131*3cd4f1d1SCharles Forsyth 	int nb, i;
132*3cd4f1d1SCharles Forsyth 	Rune c;
13337da2899SCharles.Forsyth 
134*3cd4f1d1SCharles Forsyth 	nb = 0;
135*3cd4f1d1SCharles Forsyth 	while(nrune--) {
13637da2899SCharles.Forsyth 		c = *r++;
137*3cd4f1d1SCharles Forsyth 		if(c <= Rune1){
138*3cd4f1d1SCharles Forsyth 			nb++;
139*3cd4f1d1SCharles Forsyth 		} else {
140*3cd4f1d1SCharles Forsyth 			for(i = 2; i < UTFmax + 1; i++)
141*3cd4f1d1SCharles Forsyth 				if(c <= RuneX(i) || i == UTFmax){
142*3cd4f1d1SCharles Forsyth 					nb += i;
143*3cd4f1d1SCharles Forsyth 					break;
14437da2899SCharles.Forsyth 				}
145*3cd4f1d1SCharles Forsyth 		}
146*3cd4f1d1SCharles Forsyth 	}
147*3cd4f1d1SCharles Forsyth 	return nb;
14837da2899SCharles.Forsyth }
14937da2899SCharles.Forsyth 
15037da2899SCharles.Forsyth int
fullrune(char * str,int n)15137da2899SCharles.Forsyth fullrune(char *str, int n)
15237da2899SCharles.Forsyth {
153*3cd4f1d1SCharles Forsyth 	int  i;
154*3cd4f1d1SCharles Forsyth 	Rune c;
15537da2899SCharles.Forsyth 
156*3cd4f1d1SCharles Forsyth 	if(n <= 0)
157*3cd4f1d1SCharles Forsyth 		return 0;
15837da2899SCharles.Forsyth 	c = *(uchar*)str;
15937da2899SCharles.Forsyth 	if(c < Tx)
16037da2899SCharles.Forsyth 		return 1;
161*3cd4f1d1SCharles Forsyth 	for(i = 3; i < UTFmax + 1; i++)
162*3cd4f1d1SCharles Forsyth 		if(c < T(i))
163*3cd4f1d1SCharles Forsyth 			return n >= i - 1;
164*3cd4f1d1SCharles Forsyth 	return n >= UTFmax;
16537da2899SCharles.Forsyth }
166