/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1985-2010 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                   Phong Vo <kpv@research.att.com>                    *
*                                                                      *
***********************************************************************/
224887Schin #pragma prototyped
/*
 * regex collation symbol support
 */
264887Schin
274887Schin #include "reglib.h"
284887Schin
294887Schin #include <ccode.h>
304887Schin
314887Schin #ifndef UCS_BYTE
324887Schin #define UCS_BYTE 1
334887Schin #endif
344887Schin
354887Schin #include "ucs_names.h"
364887Schin
374887Schin typedef struct Ucs_map_s
384887Schin {
394887Schin Ucs_attr_t attr[3];
404887Schin Ucs_code_t code;
414887Schin const char* name;
424887Schin Dtlink_t link;
434887Schin struct Ucs_map_s* next;
444887Schin } Ucs_map_t;
454887Schin
/*
 * bit-set operations on an array of Ucs_attr_t words
 * attribute i lives in word i>>5 at bit position i&31
 *
 * the mask is built from (Ucs_attr_t)1 rather than a plain int 1 so that
 * selecting bit 31 never shifts into the sign bit of a signed int
 * NOTE(review): assumes Ucs_attr_t is an unsigned type of at least 32 bits
 * -- confirm against ucs_names.h
 */

#define setattr(a,i)	((a)[(i)>>5]|=((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define tstattr(a,i)	((a)[(i)>>5]&((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define clrattr(a,i)	((a)[(i)>>5]&=~((Ucs_attr_t)1<<((i)&((1<<5)-1))))
494887Schin
504887Schin static struct Local_s
514887Schin {
524887Schin int fatal;
534887Schin Dt_t* attrs;
544887Schin Dt_t* names;
554887Schin Dtdisc_t dtdisc;
564887Schin #if CC_NATIVE != CC_ASCII
574887Schin unsigned char* a2n;
584887Schin #endif
594887Schin } local;
604887Schin
614887Schin /*
624887Schin * initialize the writeable tables from the readonly data
634887Schin * the tables are big enough to be concerned about text vs. data vs. bss
644887Schin * UCS_BYTE==0 100K
654887Schin * UCS_BYTE==1 20K
664887Schin */
674887Schin
684887Schin static int
initialize(void)694887Schin initialize(void)
704887Schin {
714887Schin register int i;
724887Schin register Ucs_map_t* a;
734887Schin register Ucs_map_t* w;
744887Schin
754887Schin if (local.fatal)
764887Schin return -1;
774887Schin local.dtdisc.link = offsetof(Ucs_map_t, link);
784887Schin local.dtdisc.key = offsetof(Ucs_map_t, name);
794887Schin local.dtdisc.size = -1;
804887Schin if (!(w = (Ucs_map_t*)malloc(sizeof(Ucs_map_t) * (elementsof(ucs_attrs) + elementsof(ucs_names)))))
814887Schin {
824887Schin local.fatal = 1;
834887Schin return -1;
844887Schin }
854887Schin if (!(local.attrs = dtopen(&local.dtdisc, Dttree)))
864887Schin {
874887Schin free(w);
884887Schin local.fatal = 1;
894887Schin return -1;
904887Schin }
914887Schin if (!(local.names = dtopen(&local.dtdisc, Dttree)))
924887Schin {
934887Schin free(w);
944887Schin dtclose(local.attrs);
954887Schin local.fatal = 1;
964887Schin return -1;
974887Schin }
984887Schin for (i = 0; i < elementsof(ucs_attrs); i++, w++)
994887Schin {
1004887Schin memcpy(w, &ucs_attrs[i], offsetof(Ucs_dat_t, table));
1014887Schin w->name = ucs_strings[ucs_attrs[i].table] + ucs_attrs[i].index;
1024887Schin w->next = 0;
1034887Schin dtinsert(local.attrs, w);
1044887Schin }
1054887Schin for (i = 0; i < elementsof(ucs_names); i++, w++)
1064887Schin {
1074887Schin memcpy(w, &ucs_names[i], offsetof(Ucs_dat_t, table));
1084887Schin w->name = ucs_strings[ucs_names[i].table] + ucs_names[i].index;
1094887Schin w->next = 0;
1104887Schin if (a = (Ucs_map_t*)dtsearch(local.names, w))
1114887Schin {
1124887Schin while (a->next)
1134887Schin a = a->next;
1144887Schin a->next = w;
1154887Schin }
1164887Schin else
1174887Schin dtinsert(local.names, w);
1184887Schin }
1194887Schin #if CC_NATIVE != CC_ASCII
1204887Schin local.a2n = ccmap(CC_ASCII, CC_NATIVE);
1214887Schin #endif
1224887Schin return 0;
1234887Schin }
1244887Schin
1254887Schin /*
1264887Schin * return the collating symbol delimited by [c c], where c is either '=' or '.'
1274887Schin * s points to the first char after the initial [
1284887Schin * if e!=0 it is set to point to the next char in s on return
1294887Schin *
1304887Schin * the collating symbol is converted to multibyte in <buf,size>
1314887Schin * the return value is:
1324887Schin * -1 syntax error or buf not large enough
1334887Schin * >=0 size with 0-terminated mb collation element
1344887Schin * or ligature value in buf
1354887Schin */
1364887Schin
1374887Schin int
regcollate(register const char * s,char ** e,char * buf,int size)1384887Schin regcollate(register const char* s, char** e, char* buf, int size)
1394887Schin {
1404887Schin register int c;
1414887Schin register char* u;
1424887Schin register char* b;
1434887Schin register char* x;
1444887Schin register Ucs_map_t* a;
1454887Schin Ucs_map_t* z;
1464887Schin const char* t;
1474887Schin const char* v;
1484887Schin int n;
1494887Schin int r;
1504887Schin int ul;
1514887Schin int term;
1524887Schin wchar_t w[2];
1534887Schin Ucs_attr_t attr[3];
1544887Schin
1554887Schin if (size < 2)
1564887Schin r = -1;
1574887Schin else if ((term = *s++) != '.' && term != '=')
1584887Schin {
1594887Schin s--;
1604887Schin r = -1;
1614887Schin }
1624887Schin else if (*s == term && *(s + 1) == ']')
1634887Schin r = -1;
1644887Schin else
1654887Schin {
1664887Schin t = s;
1674887Schin mbchar(s);
1684887Schin if ((n = (s - t)) == 1)
1694887Schin {
1704887Schin if (*s == term && *(s + 1) == ']')
1714887Schin {
1724887Schin s += 2;
1734887Schin r = -1;
1744887Schin }
1754887Schin else
1764887Schin {
1774887Schin if (!local.attrs && initialize())
1784887Schin return -1;
1794887Schin attr[0] = attr[1] = attr[2] = 0;
1804887Schin ul = 0;
1814887Schin b = buf;
1824887Schin x = buf + size - 2;
1834887Schin r = 1;
1844887Schin s = t;
1854887Schin do
1864887Schin {
1874887Schin v = s;
1884887Schin u = b;
1894887Schin for (;;)
1904887Schin {
1914887Schin if (!(c = *s++))
1924887Schin return -1;
1934887Schin if (c == term)
1944887Schin {
1954887Schin if (!(c = *s++))
1964887Schin return -1;
1974887Schin if (c != term)
1984887Schin {
1994887Schin if (c != ']')
2004887Schin return -1;
2014887Schin r = -1;
2024887Schin break;
2034887Schin }
2044887Schin }
2054887Schin if (c == ' ' || c == '-' && u > b && *s != ' ' && *s != '-')
2064887Schin break;
2074887Schin if (isupper(c))
2084887Schin c = tolower(c);
2094887Schin if (u > x)
2104887Schin break;
2114887Schin *u++ = c;
2124887Schin }
2134887Schin *u = 0;
2144887Schin if (a = (Ucs_map_t*)dtmatch(local.attrs, b))
2154887Schin setattr(attr, a->code);
2164887Schin else
2174887Schin {
2184887Schin if (u < x)
2194887Schin *u++ = ' ';
2204887Schin if (b == buf)
2214887Schin {
2224887Schin if (isupper(*v))
2234887Schin ul = UCS_UC;
2244887Schin else if (islower(*v))
2254887Schin ul = UCS_LC;
2264887Schin }
2274887Schin b = u;
2284887Schin }
2294887Schin } while (r > 0);
2304887Schin if (b > buf && *(b - 1) == ' ')
2314887Schin b--;
2324887Schin *b = 0;
2334887Schin attr[0] &= ~((Ucs_attr_t)1);
2344887Schin if (ul)
2354887Schin {
2364887Schin if (tstattr(attr, UCS_UC) || tstattr(attr, UCS_LC))
2374887Schin ul = 0;
2384887Schin else
2394887Schin setattr(attr, ul);
2404887Schin }
2414887Schin if (z = (Ucs_map_t*)dtmatch(local.names, buf))
2424887Schin for(;;)
2434887Schin {
2444887Schin for (a = z; a; a = a->next)
2454887Schin if ((attr[0] & a->attr[0]) == attr[0] && (attr[1] & a->attr[1]) == attr[1] && (attr[2] & a->attr[2]) == attr[2])
2464887Schin {
2478462SApril.Chin@Sun.COM #if 0
2484887Schin if (a->code <= 0xff)
2494887Schin {
2504887Schin #if CC_NATIVE != CC_ASCII
2514887Schin buf[0] = local.a2n[a->code];
2524887Schin #else
2534887Schin buf[0] = a->code;
2544887Schin #endif
2554887Schin buf[r = 1] = 0;
2564887Schin ul = 0;
2574887Schin break;
2584887Schin }
2598462SApril.Chin@Sun.COM #endif
2604887Schin w[0] = a->code;
2614887Schin w[1] = 0;
2624887Schin if ((r = wcstombs(buf, w, size)) > 0)
2634887Schin ul = 0;
2644887Schin break;
2654887Schin }
2664887Schin if (!ul)
2674887Schin break;
2684887Schin clrattr(attr, ul);
2694887Schin ul = 0;
2704887Schin }
2714887Schin }
2724887Schin if (r < 0)
2734887Schin {
2748462SApril.Chin@Sun.COM if ((n = s - t - 2) > (size - 1))
2754887Schin return -1;
2768462SApril.Chin@Sun.COM memcpy(buf, t, n);
2778462SApril.Chin@Sun.COM buf[n] = 0;
2788462SApril.Chin@Sun.COM if (n == 1)
2798462SApril.Chin@Sun.COM r = n;
2808462SApril.Chin@Sun.COM else
2818462SApril.Chin@Sun.COM {
2828462SApril.Chin@Sun.COM for (t = buf; isalnum(*t); t++);
2838462SApril.Chin@Sun.COM if (!*t)
2848462SApril.Chin@Sun.COM r = n;
2858462SApril.Chin@Sun.COM }
2864887Schin }
2874887Schin }
2884887Schin else if (*s++ != term || *s++ != ']')
2894887Schin {
2904887Schin s--;
2914887Schin r = -1;
2924887Schin }
2934887Schin else if (n > (size - 1))
2944887Schin r = -1;
2954887Schin else
2964887Schin {
2974887Schin memcpy(buf, t, n);
2984887Schin buf[r = n] = 0;
2994887Schin }
3004887Schin }
3014887Schin if (e)
3024887Schin *e = (char*)s;
3034887Schin return r;
3044887Schin }
305