xref: /netbsd-src/external/mit/lua/dist/src/lutf8lib.c (revision 80d9064ac03cbb6a4174695f0d5b237c8766d3d0)
1 /*	$NetBSD: lutf8lib.c,v 1.1.1.1 2014/07/20 23:17:39 lneto Exp $	*/
2 
3 /*
4 ** Id: lutf8lib.c,v 1.9 2014/05/14 18:33:37 roberto Exp
5 ** Standard library for UTF-8 manipulation
6 ** See Copyright Notice in lua.h
7 */
8 
9 
10 #include <assert.h>
11 #include <stdlib.h>
12 #include <string.h>
13 
14 #define lutf8lib_c
15 #define LUA_LIB
16 
17 #include "lua.h"
18 
19 #include "lauxlib.h"
20 #include "lualib.h"
21 
/* largest code point value accepted by the functions in this file */
#define MAXUNICODE	0x10FFFF

/* true when the byte pointed to by 'p' has the form 10xxxxxx */
#define iscont(p)	(((*(p)) & 0xC0) == 0x80)
25 
26 
27 /* from strlib */
28 /* translate a relative string position: negative means back from end */
29 static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
30   if (pos >= 0) return pos;
31   else if (0u - (size_t)pos > len) return 0;
32   else return (lua_Integer)len + pos + 1;
33 }
34 
35 
36 /*
37 ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
38 */
39 static const char *utf8_decode (const char *o, int *val) {
40   static unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
41   const unsigned char *s = (const unsigned char *)o;
42   unsigned int c = s[0];
43   unsigned int res = 0;  /* final result */
44   if (c < 0x80)  /* ascii? */
45     res = c;
46   else {
47     int count = 0;  /* to count number of continuation bytes */
48     while (c & 0x40) {  /* still have continuation bytes? */
49       int cc = s[++count];  /* read next byte */
50       if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
51         return NULL;  /* invalid byte sequence */
52       res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
53       c <<= 1;  /* to test next bit */
54     }
55     res |= ((c & 0x7F) << (count * 5));  /* add first byte */
56     if (count > 3 || res > MAXUNICODE || res <= limits[count])
57       return NULL;  /* invalid byte sequence */
58     s += count;  /* skip continuation bytes read */
59   }
60   if (val) *val = res;
61   return (const char *)s + 1;  /* +1 to include first byte */
62 }
63 
64 
65 /*
66 ** utf8len(s [, i [, j]]) --> number of characters that start in the
67 ** range [i,j], or nil + current position if 's' is not well formed in
68 ** that interval
69 */
70 static int utflen (lua_State *L) {
71   int n = 0;
72   size_t len;
73   const char *s = luaL_checklstring(L, 1, &len);
74   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
75   lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
76   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
77                    "initial position out of string");
78   luaL_argcheck(L, --posj < (lua_Integer)len, 3,
79                    "final position out of string");
80   while (posi <= posj) {
81     const char *s1 = utf8_decode(s + posi, NULL);
82     if (s1 == NULL) {  /* conversion error? */
83       lua_pushnil(L);  /* return nil ... */
84       lua_pushinteger(L, posi + 1);  /* ... and current position */
85       return 2;
86     }
87     posi = s1 - s;
88     n++;
89   }
90   lua_pushinteger(L, n);
91   return 1;
92 }
93 
94 
95 /*
96 ** codepoint(s, [i, [j]])  -> returns codepoints for all characters
97 ** that start in the range [i,j]
98 */
99 static int codepoint (lua_State *L) {
100   size_t len;
101   const char *s = luaL_checklstring(L, 1, &len);
102   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
103   lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
104   int n;
105   const char *se;
106   luaL_argcheck(L, posi >= 1, 2, "out of range");
107   luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
108   if (posi > pose) return 0;  /* empty interval; return no values */
109   n = (int)(pose -  posi + 1);
110   if (posi + n <= pose)  /* (lua_Integer -> int) overflow? */
111     return luaL_error(L, "string slice too long");
112   luaL_checkstack(L, n, "string slice too long");
113   n = 0;
114   se = s + pose;
115   for (s += posi - 1; s < se;) {
116     int code;
117     s = utf8_decode(s, &code);
118     if (s == NULL)
119       return luaL_error(L, "invalid UTF-8 code");
120     lua_pushinteger(L, code);
121     n++;
122   }
123   return n;
124 }
125 
126 
127 static void pushutfchar (lua_State *L, int arg) {
128   int code = luaL_checkint(L, arg);
129   luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
130   lua_pushfstring(L, "%U", code);
131 }
132 
133 
134 /*
135 ** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
136 */
137 static int utfchar (lua_State *L) {
138   int n = lua_gettop(L);  /* number of arguments */
139   if (n == 1)  /* optimize common case of single char */
140     pushutfchar(L, 1);
141   else {
142     int i;
143     luaL_Buffer b;
144     luaL_buffinit(L, &b);
145     for (i = 1; i <= n; i++) {
146       pushutfchar(L, i);
147       luaL_addvalue(&b);
148     }
149     luaL_pushresult(&b);
150   }
151   return 1;
152 }
153 
154 
155 /*
156 ** offset(s, n, [i])  -> index where n-th character counting from
157 **   position 'i' starts; 0 means character at 'i'.
158 */
159 static int byteoffset (lua_State *L) {
160   size_t len;
161   const char *s = luaL_checklstring(L, 1, &len);
162   int n  = luaL_checkint(L, 2);
163   lua_Integer posi = (n >= 0) ? 1 : len + 1;
164   posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
165   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
166                    "position out of range");
167   if (n == 0) {
168     /* find beginning of current byte sequence */
169     while (posi > 0 && iscont(s + posi)) posi--;
170   }
171   else {
172     if (iscont(s + posi))
173       luaL_error(L, "initial position is a continuation byte");
174     if (n < 0) {
175        while (n < 0 && posi > 0) {  /* move back */
176          do {  /* find beginning of previous character */
177            posi--;
178          } while (posi > 0 && iscont(s + posi));
179          n++;
180        }
181      }
182      else {
183        n--;  /* do not move for 1st character */
184        while (n > 0 && posi < (lua_Integer)len) {
185          do {  /* find beginning of next character */
186            posi++;
187          } while (iscont(s + posi));  /* (cannot pass final '\0') */
188          n--;
189        }
190      }
191   }
192   if (n == 0)  /* did it find given character? */
193     lua_pushinteger(L, posi + 1);
194   else  /* no such character */
195     lua_pushnil(L);
196   return 1;
197 }
198 
199 
200 static int iter_aux (lua_State *L) {
201   size_t len;
202   const char *s = luaL_checklstring(L, 1, &len);
203   lua_Integer n = lua_tointeger(L, 2) - 1;
204   if (n < 0)  /* first iteration? */
205     n = 0;  /* start from here */
206   else if (n < (lua_Integer)len) {
207     n++;  /* skip current byte */
208     while (iscont(s + n)) n++;  /* and its continuations */
209   }
210   if (n >= (lua_Integer)len)
211     return 0;  /* no more codepoints */
212   else {
213     int code;
214     const char *next = utf8_decode(s + n, &code);
215     if (next == NULL || iscont(next))
216       return luaL_error(L, "invalid UTF-8 code");
217     lua_pushinteger(L, n + 1);
218     lua_pushinteger(L, code);
219     return 2;
220   }
221 }
222 
223 
224 static int iter_codes (lua_State *L) {
225   luaL_checkstring(L, 1);
226   lua_pushcfunction(L, iter_aux);
227   lua_pushvalue(L, 1);
228   lua_pushinteger(L, 0);
229   return 3;
230 }
231 
232 
/*
** Pattern to match a single UTF-8 character: one lead byte followed by
** any number of continuation bytes. (The literal contains an embedded
** '\0'; the two adjacent pieces are joined by the compiler.)
*/
#define UTF8PATT	"[\0-\x7F\xC2-\xF4]" "[\x80-\xBF]*"
235 
236 
237 static struct luaL_Reg funcs[] = {
238   {"offset", byteoffset},
239   {"codepoint", codepoint},
240   {"char", utfchar},
241   {"len", utflen},
242   {"codes", iter_codes},
243   {NULL, NULL}
244 };
245 
246 
247 LUAMOD_API int luaopen_utf8 (lua_State *L) {
248   luaL_newlib(L, funcs);
249   lua_pushliteral(L, UTF8PATT);
250   lua_setfield(L, -2, "charpatt");
251   return 1;
252 }
253 
254