/*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
1440ef9009SDavid du Colombier #include <stdarg.h>
1540ef9009SDavid du Colombier #include <string.h>
1640ef9009SDavid du Colombier #include "utf.h"
1740ef9009SDavid du Colombier #include "utfdef.h"
1840ef9009SDavid du Colombier
1940ef9009SDavid du Colombier enum
2040ef9009SDavid du Colombier {
2140ef9009SDavid du Colombier Bit1 = 7,
2240ef9009SDavid du Colombier Bitx = 6,
2340ef9009SDavid du Colombier Bit2 = 5,
2440ef9009SDavid du Colombier Bit3 = 4,
2540ef9009SDavid du Colombier Bit4 = 3,
26*e94a8e9bSDavid du Colombier Bit5 = 2,
2740ef9009SDavid du Colombier
2840ef9009SDavid du Colombier T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
2940ef9009SDavid du Colombier Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
3040ef9009SDavid du Colombier T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
3140ef9009SDavid du Colombier T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
3240ef9009SDavid du Colombier T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
33*e94a8e9bSDavid du Colombier T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
3440ef9009SDavid du Colombier
35*e94a8e9bSDavid du Colombier Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
36*e94a8e9bSDavid du Colombier Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
37*e94a8e9bSDavid du Colombier Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
38*e94a8e9bSDavid du Colombier Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
3940ef9009SDavid du Colombier
4040ef9009SDavid du Colombier Maskx = (1<<Bitx)-1, /* 0011 1111 */
4140ef9009SDavid du Colombier Testx = Maskx ^ 0xFF, /* 1100 0000 */
4240ef9009SDavid du Colombier
43*e94a8e9bSDavid du Colombier SurrogateMin = 0xD800,
44*e94a8e9bSDavid du Colombier SurrogateMax = 0xDFFF,
45*e94a8e9bSDavid du Colombier
4640ef9009SDavid du Colombier Bad = Runeerror,
4740ef9009SDavid du Colombier };
4840ef9009SDavid du Colombier
4940ef9009SDavid du Colombier int
chartorune(Rune * rune,char * str)5040ef9009SDavid du Colombier chartorune(Rune *rune, char *str)
5140ef9009SDavid du Colombier {
52*e94a8e9bSDavid du Colombier int c, c1, c2, c3;
5340ef9009SDavid du Colombier long l;
5440ef9009SDavid du Colombier
5540ef9009SDavid du Colombier /*
5640ef9009SDavid du Colombier * one character sequence
5740ef9009SDavid du Colombier * 00000-0007F => T1
5840ef9009SDavid du Colombier */
5940ef9009SDavid du Colombier c = *(uchar*)str;
6040ef9009SDavid du Colombier if(c < Tx) {
6140ef9009SDavid du Colombier *rune = c;
6240ef9009SDavid du Colombier return 1;
6340ef9009SDavid du Colombier }
6440ef9009SDavid du Colombier
6540ef9009SDavid du Colombier /*
6640ef9009SDavid du Colombier * two character sequence
67*e94a8e9bSDavid du Colombier * 00080-007FF => T2 Tx
6840ef9009SDavid du Colombier */
6940ef9009SDavid du Colombier c1 = *(uchar*)(str+1) ^ Tx;
7040ef9009SDavid du Colombier if(c1 & Testx)
7140ef9009SDavid du Colombier goto bad;
7240ef9009SDavid du Colombier if(c < T3) {
7340ef9009SDavid du Colombier if(c < T2)
7440ef9009SDavid du Colombier goto bad;
7540ef9009SDavid du Colombier l = ((c << Bitx) | c1) & Rune2;
7640ef9009SDavid du Colombier if(l <= Rune1)
7740ef9009SDavid du Colombier goto bad;
7840ef9009SDavid du Colombier *rune = l;
7940ef9009SDavid du Colombier return 2;
8040ef9009SDavid du Colombier }
8140ef9009SDavid du Colombier
8240ef9009SDavid du Colombier /*
8340ef9009SDavid du Colombier * three character sequence
84*e94a8e9bSDavid du Colombier * 00800-0FFFF => T3 Tx Tx
8540ef9009SDavid du Colombier */
8640ef9009SDavid du Colombier c2 = *(uchar*)(str+2) ^ Tx;
87*e94a8e9bSDavid du Colombier
8840ef9009SDavid du Colombier if(c2 & Testx)
8940ef9009SDavid du Colombier goto bad;
9040ef9009SDavid du Colombier if(c < T4) {
9140ef9009SDavid du Colombier l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
9240ef9009SDavid du Colombier if(l <= Rune2)
9340ef9009SDavid du Colombier goto bad;
94*e94a8e9bSDavid du Colombier if (SurrogateMin <= l && l <= SurrogateMax)
95*e94a8e9bSDavid du Colombier goto bad;
9640ef9009SDavid du Colombier *rune = l;
9740ef9009SDavid du Colombier return 3;
9840ef9009SDavid du Colombier }
9940ef9009SDavid du Colombier
10040ef9009SDavid du Colombier /*
101*e94a8e9bSDavid du Colombier * four character sequence
102*e94a8e9bSDavid du Colombier * 10000-10FFFF => T4 Tx Tx Tx
103*e94a8e9bSDavid du Colombier */
104*e94a8e9bSDavid du Colombier if(UTFmax >= 4) {
105*e94a8e9bSDavid du Colombier c3 = *(uchar*)(str+3) ^ Tx;
106*e94a8e9bSDavid du Colombier if(c3 & Testx)
107*e94a8e9bSDavid du Colombier goto bad;
108*e94a8e9bSDavid du Colombier if(c < T5) {
109*e94a8e9bSDavid du Colombier l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
110*e94a8e9bSDavid du Colombier if(l <= Rune3)
111*e94a8e9bSDavid du Colombier goto bad;
112*e94a8e9bSDavid du Colombier if(l > Runemax)
113*e94a8e9bSDavid du Colombier goto bad;
114*e94a8e9bSDavid du Colombier *rune = l;
115*e94a8e9bSDavid du Colombier return 4;
116*e94a8e9bSDavid du Colombier }
117*e94a8e9bSDavid du Colombier }
118*e94a8e9bSDavid du Colombier
119*e94a8e9bSDavid du Colombier /*
12040ef9009SDavid du Colombier * bad decoding
12140ef9009SDavid du Colombier */
12240ef9009SDavid du Colombier bad:
12340ef9009SDavid du Colombier *rune = Bad;
12440ef9009SDavid du Colombier return 1;
12540ef9009SDavid du Colombier }
12640ef9009SDavid du Colombier
12740ef9009SDavid du Colombier int
runetochar(char * str,Rune * rune)12840ef9009SDavid du Colombier runetochar(char *str, Rune *rune)
12940ef9009SDavid du Colombier {
13040ef9009SDavid du Colombier long c;
13140ef9009SDavid du Colombier
13240ef9009SDavid du Colombier /*
13340ef9009SDavid du Colombier * one character sequence
13440ef9009SDavid du Colombier * 00000-0007F => 00-7F
13540ef9009SDavid du Colombier */
13640ef9009SDavid du Colombier c = *rune;
13740ef9009SDavid du Colombier if(c <= Rune1) {
13840ef9009SDavid du Colombier str[0] = c;
13940ef9009SDavid du Colombier return 1;
14040ef9009SDavid du Colombier }
14140ef9009SDavid du Colombier
14240ef9009SDavid du Colombier /*
14340ef9009SDavid du Colombier * two character sequence
14440ef9009SDavid du Colombier * 0080-07FF => T2 Tx
14540ef9009SDavid du Colombier */
14640ef9009SDavid du Colombier if(c <= Rune2) {
14740ef9009SDavid du Colombier str[0] = T2 | (c >> 1*Bitx);
14840ef9009SDavid du Colombier str[1] = Tx | (c & Maskx);
14940ef9009SDavid du Colombier return 2;
15040ef9009SDavid du Colombier }
151*e94a8e9bSDavid du Colombier /*
152*e94a8e9bSDavid du Colombier * If the Rune is out of range or a surrogate half, convert it to the error rune.
153*e94a8e9bSDavid du Colombier * Do this test here because the error rune encodes to three bytes.
154*e94a8e9bSDavid du Colombier * Doing it earlier would duplicate work, since an out of range
155*e94a8e9bSDavid du Colombier * Rune wouldn't have fit in one or two bytes.
156*e94a8e9bSDavid du Colombier */
157*e94a8e9bSDavid du Colombier if (c > Runemax)
158*e94a8e9bSDavid du Colombier c = Runeerror;
159*e94a8e9bSDavid du Colombier if (SurrogateMin <= c && c <= SurrogateMax)
160*e94a8e9bSDavid du Colombier c = Runeerror;
16140ef9009SDavid du Colombier
16240ef9009SDavid du Colombier /*
16340ef9009SDavid du Colombier * three character sequence
16440ef9009SDavid du Colombier * 0800-FFFF => T3 Tx Tx
16540ef9009SDavid du Colombier */
166*e94a8e9bSDavid du Colombier if (c <= Rune3) {
16740ef9009SDavid du Colombier str[0] = T3 | (c >> 2*Bitx);
16840ef9009SDavid du Colombier str[1] = Tx | ((c >> 1*Bitx) & Maskx);
16940ef9009SDavid du Colombier str[2] = Tx | (c & Maskx);
17040ef9009SDavid du Colombier return 3;
17140ef9009SDavid du Colombier }
17240ef9009SDavid du Colombier
173*e94a8e9bSDavid du Colombier /*
174*e94a8e9bSDavid du Colombier * four character sequence (21-bit value)
175*e94a8e9bSDavid du Colombier * 10000-1FFFFF => T4 Tx Tx Tx
176*e94a8e9bSDavid du Colombier */
177*e94a8e9bSDavid du Colombier str[0] = T4 | (c >> 3*Bitx);
178*e94a8e9bSDavid du Colombier str[1] = Tx | ((c >> 2*Bitx) & Maskx);
179*e94a8e9bSDavid du Colombier str[2] = Tx | ((c >> 1*Bitx) & Maskx);
180*e94a8e9bSDavid du Colombier str[3] = Tx | (c & Maskx);
181*e94a8e9bSDavid du Colombier return 4;
182*e94a8e9bSDavid du Colombier }
183*e94a8e9bSDavid du Colombier
18440ef9009SDavid du Colombier int
runelen(long c)18540ef9009SDavid du Colombier runelen(long c)
18640ef9009SDavid du Colombier {
18740ef9009SDavid du Colombier Rune rune;
18840ef9009SDavid du Colombier char str[10];
18940ef9009SDavid du Colombier
19040ef9009SDavid du Colombier rune = c;
19140ef9009SDavid du Colombier return runetochar(str, &rune);
19240ef9009SDavid du Colombier }
19340ef9009SDavid du Colombier
19440ef9009SDavid du Colombier int
runenlen(Rune * r,int nrune)19540ef9009SDavid du Colombier runenlen(Rune *r, int nrune)
19640ef9009SDavid du Colombier {
19740ef9009SDavid du Colombier int nb, c;
19840ef9009SDavid du Colombier
19940ef9009SDavid du Colombier nb = 0;
20040ef9009SDavid du Colombier while(nrune--) {
20140ef9009SDavid du Colombier c = *r++;
20240ef9009SDavid du Colombier if(c <= Rune1)
20340ef9009SDavid du Colombier nb++;
204*e94a8e9bSDavid du Colombier else if(c <= Rune2)
20540ef9009SDavid du Colombier nb += 2;
206*e94a8e9bSDavid du Colombier else if(c <= Rune3)
20740ef9009SDavid du Colombier nb += 3;
208*e94a8e9bSDavid du Colombier else
209*e94a8e9bSDavid du Colombier nb += 4;
21040ef9009SDavid du Colombier }
21140ef9009SDavid du Colombier return nb;
21240ef9009SDavid du Colombier }
21340ef9009SDavid du Colombier
21440ef9009SDavid du Colombier int
fullrune(char * str,int n)21540ef9009SDavid du Colombier fullrune(char *str, int n)
21640ef9009SDavid du Colombier {
21740ef9009SDavid du Colombier int c;
218*e94a8e9bSDavid du Colombier if(n <= 0)
219*e94a8e9bSDavid du Colombier return 0;
22040ef9009SDavid du Colombier c = *(uchar*)str;
22140ef9009SDavid du Colombier if(c < Tx)
22240ef9009SDavid du Colombier return 1;
223*e94a8e9bSDavid du Colombier if(c < T3)
224*e94a8e9bSDavid du Colombier return n >= 2;
225*e94a8e9bSDavid du Colombier if(c < T4)
226*e94a8e9bSDavid du Colombier return n >= 3;
227*e94a8e9bSDavid du Colombier return n >= 4;
22840ef9009SDavid du Colombier }
229