xref: /plan9/sys/src/cmd/unix/drawterm/libc/rune.c (revision a1216cc64119db675aa140f55fbd73eb2414b763)
1 #include	<u.h>
2 #include	<libc.h>
3 
/* Bit(i) is 7-i: the low-order bits of a byte left over after an i-bit tag and its 0 */
#define Bit(i) (7-(i))
/* T(i): byte with the top i bits set and the rest clear; T(2) is 1100 0000 */
#define T(i) (0xFF ^ ((1 << (Bit(i)+1))-1))
/* RuneX(i): all value bits of an i-byte sequence set; RuneX(2) is 0111 1111 1111 */
#define	RuneX(i) ((1 << (((i)-1)*Bitx + Bit(i)))-1)
9 
10 enum
11 {
12 	Bitx	= Bit(1),
13 
14 	Tx	= T(1),			/* 1000 0000 */
15 	Rune1 = (1<<(Bit(0)+0*Bitx))-1,	/* 0000 0000 0000 0000 0111 1111 */
16 
17 	Maskx	= (1<<Bitx)-1,		/* 0011 1111 */
18 	Testx	= Maskx ^ 0xFF,		/* 1100 0000 */
19 
20 	SurrogateMin	= 0xD800,
21 	SurrogateMax	= 0xDFFF,
22 
23 	Bad	= Runeerror,
24 };
25 
26 int
chartorune(Rune * rune,char * str)27 chartorune(Rune *rune, char *str)
28 {
29 	int c[UTFmax], i;
30 	Rune l;
31 
32 	/*
33 	 * N character sequence
34 	 *	00000-0007F => T1
35 	 *	00080-007FF => T2 Tx
36 	 *	00800-0FFFF => T3 Tx Tx
37 	 *	10000-10FFFF => T4 Tx Tx Tx
38 	 */
39 
40 	c[0] = *(uchar*)(str);
41 	if(c[0] < Tx){
42 		*rune = c[0];
43 		return 1;
44 	}
45 	l = c[0];
46 
47 	for(i = 1; i < UTFmax; i++) {
48 		c[i] = *(uchar*)(str+i);
49 		c[i] ^= Tx;
50 		if(c[i] & Testx)
51 			goto bad;
52 		l = (l << Bitx) | c[i];
53 		if(c[0] < T(i + 2)) {
54 			l &= RuneX(i + 1);
55 			if(i == 1) {
56 				if(c[0] < T(2) || l <= Rune1)
57 					goto bad;
58 			} else if(l <= RuneX(i) || l > Runemax)
59 				goto bad;
60 			if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
61 				goto bad;
62 			*rune = l;
63 			return i + 1;
64 		}
65 	}
66 
67 	/*
68 	 * bad decoding
69 	 */
70 bad:
71 	*rune = Bad;
72 	return 1;
73 }
74 
75 int
runetochar(char * str,Rune * rune)76 runetochar(char *str, Rune *rune)
77 {
78 	int i, j;
79 	Rune c;
80 
81 	c = *rune;
82 	if(c <= Rune1) {
83 		str[0] = c;
84 		return 1;
85 	}
86 
87 	/*
88 	 * one character sequence
89 	 *	00000-0007F => 00-7F
90 	 * two character sequence
91 	 *	0080-07FF => T2 Tx
92 	 * three character sequence
93 	 *	0800-FFFF => T3 Tx Tx
94 	 * four character sequence (21-bit value)
95 	 *     10000-1FFFFF => T4 Tx Tx Tx
96 	 * If the Rune is out of range or a surrogate half,
97 	 * convert it to the error rune.
98 	 * Do this test when i==3 because the error rune encodes to three bytes.
99 	 * Doing it earlier would duplicate work, since an out of range
100 	 * Rune wouldn't have fit in one or two bytes.
101 	 */
102 	for(i = 2; i < UTFmax + 1; i++){
103 		if(i == 3){
104 			if(c > Runemax)
105 				c = Runeerror;
106 			if(SurrogateMin <= c && c <= SurrogateMax)
107 				c = Runeerror;
108 		}
109 		if (c <= RuneX(i) || i == UTFmax ) {
110 			str[0] = T(i) |  (c >> (i - 1)*Bitx);
111 			for(j = 1; j < i; j++)
112 				str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
113 			return i;
114 		}
115 	}
116 	return UTFmax;
117 }
118 
119 int
runelen(long c)120 runelen(long c)
121 {
122 	Rune rune;
123 	char str[10];
124 
125 	rune = c;
126 	return runetochar(str, &rune);
127 }
128 
129 int
runenlen(Rune * r,int nrune)130 runenlen(Rune *r, int nrune)
131 {
132 	int nb, i;
133 	Rune c;
134 
135 	nb = 0;
136 	while(nrune--) {
137 		c = *r++;
138 		if(c <= Rune1){
139 			nb++;
140 		} else {
141 			for(i = 2; i < UTFmax + 1; i++)
142 				if(c <= RuneX(i) || i == UTFmax){
143 					nb += i;
144 					break;
145 				}
146 		}
147 	}
148 	return nb;
149 }
150 
151 int
fullrune(char * str,int n)152 fullrune(char *str, int n)
153 {
154 	int  i;
155 	Rune c;
156 
157 	if(n <= 0)
158 		return 0;
159 	c = *(uchar*)str;
160 	if(c < Tx)
161 		return 1;
162 	for(i = 3; i < UTFmax + 1; i++)
163 		if(c < T(i))
164 			return n >= i - 1;
165 	return n >= UTFmax;
166 }
167