10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
54538Sdamico * Common Development and Distribution License (the "License").
64538Sdamico * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate /*
22*6951Sab196087 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
230Sstevel@tonic-gate * Use is subject to license terms.
240Sstevel@tonic-gate */
250Sstevel@tonic-gate
260Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI"
270Sstevel@tonic-gate
/*
 * sub3.c ... ALE enhancement.
 * Since a typical Asian language has a huge character set, it is not
 * ideal to index an array by a character code itself, which requires
 * as many as 2**16 entries per array.
 * To get around this problem, we identify a set of characters that
 * causes the same transition on all states and call it a character group.
 * All characters in the same character group share a unique number called
 * the character group id.  A function yycgid(c) maps the character c (in
 * process code) to the id.  This mapping is determined by analyzing all
 * regular expressions in the lex program.
 *
 */
410Sstevel@tonic-gate #include <stdlib.h>
420Sstevel@tonic-gate #include <widec.h>
430Sstevel@tonic-gate #include <search.h>
444538Sdamico #include "ldefs.h"
450Sstevel@tonic-gate
/*
 * "lchar" stands for linearized character.  It is a variant of
 * process code.  AT&T's 16-bit process code has a drawback in which
 * for three process codes C, D and E where C <= D <= E,
 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C).
 * In other words, the four codesets alternate as the magnitude
 * of the character increases.
 * The lchar representation holds this property:
 *   If three lchars C', D' and E' have the relationship C' < D' < E' and
 *   codeset(C') == codeset(E') then D' is guaranteed to belong to
 *   the same codeset as C' and E'.
 * lchar is implemented as a 32-bit entity and the function linearize()
 * that maps a wchar_t to lchar is defined below.  There is no
 * reverse function for it though.
 * The 32-bit process code by AT&T, used only for the Taiwanese version at
 * the time of writing, has no such problem and we use it as it is.
 */
630Sstevel@tonic-gate
640Sstevel@tonic-gate lchar yycgidtbl[MAXNCG] = {
650Sstevel@tonic-gate 0, /* For ease of computation of the id. */
660Sstevel@tonic-gate '\n', /* Newline is always special because '.' exclude it. */
670Sstevel@tonic-gate 0x000000ff, /* The upper limit of codeset 0. */
680Sstevel@tonic-gate 0x20ffffff, /* The upper limit of codeset 2. */
690Sstevel@tonic-gate 0x40ffffff /* The upper limit of codeset 3. */
700Sstevel@tonic-gate /* 0x60ffffff The upper limit of codeset 1. */
710Sstevel@tonic-gate /* Above assumes the number of significant bits of wchar_t is <= 24. */
720Sstevel@tonic-gate };
730Sstevel@tonic-gate int ncgidtbl = 5; /* # elements in yycgidtbl. */
740Sstevel@tonic-gate int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */
750Sstevel@tonic-gate /* returns plus 1. */
760Sstevel@tonic-gate
770Sstevel@tonic-gate static void setsymbol(int i);
780Sstevel@tonic-gate
790Sstevel@tonic-gate /*
800Sstevel@tonic-gate * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below:
810Sstevel@tonic-gate *
820Sstevel@tonic-gate * wc: axxxxxxbyyyyyyy
830Sstevel@tonic-gate *
840Sstevel@tonic-gate * returns: 0ab0000000000000axxxxxxxbyyyyyyy
850Sstevel@tonic-gate *
860Sstevel@tonic-gate * linearize() doesn't do any if compiled with 32-bit wchar_t, use of
870Sstevel@tonic-gate * which is flagged with LONG_WCHAR_T macro.
880Sstevel@tonic-gate * NOTE:
890Sstevel@tonic-gate * The implementation is highly depends on the process code representation.
900Sstevel@tonic-gate * This function should be modified when 32-bit process code is used.
910Sstevel@tonic-gate * There is no need to keep 'a' and 'b' bits in the lower half of lchar.
920Sstevel@tonic-gate * You can actually omit these and squeeze the xxxxxx part one bit right.
930Sstevel@tonic-gate * We don't do that here just in sake of speed.
940Sstevel@tonic-gate */
950Sstevel@tonic-gate lchar
linearize(wchar_t wc)960Sstevel@tonic-gate linearize(wchar_t wc)
970Sstevel@tonic-gate {
980Sstevel@tonic-gate #ifdef LONG_WCHAR_T
990Sstevel@tonic-gate return ((lchar)wc); /* Don't do anything. */
1000Sstevel@tonic-gate #else
1010Sstevel@tonic-gate
1020Sstevel@tonic-gate lchar prefix;
1030Sstevel@tonic-gate switch (wc&0x8080) {
1040Sstevel@tonic-gate case 0x0000: prefix = 0x00000000; break;
1050Sstevel@tonic-gate case 0x0080: prefix = 0x20000000; break;
1060Sstevel@tonic-gate case 0x8000: prefix = 0x40000000; break;
1070Sstevel@tonic-gate case 0x8080: prefix = 0x60000000; break;
1080Sstevel@tonic-gate }
1090Sstevel@tonic-gate return (prefix|wc);
1100Sstevel@tonic-gate #endif
1110Sstevel@tonic-gate }
1120Sstevel@tonic-gate
1130Sstevel@tonic-gate /* compare liniear characters pointed to by pc1 and pc2 */
1140Sstevel@tonic-gate int
cmplc(const void * arg1,const void * arg2)1150Sstevel@tonic-gate cmplc(const void *arg1, const void *arg2)
1160Sstevel@tonic-gate {
1170Sstevel@tonic-gate lchar *pc1 = (lchar *)arg1;
1180Sstevel@tonic-gate lchar *pc2 = (lchar *)arg2;
1190Sstevel@tonic-gate
1200Sstevel@tonic-gate if (*pc1 > *pc2)
1210Sstevel@tonic-gate return (1);
1220Sstevel@tonic-gate else if (*pc1 == *pc2)
1230Sstevel@tonic-gate return (0);
1240Sstevel@tonic-gate else
1250Sstevel@tonic-gate return (-1);
1260Sstevel@tonic-gate }
1270Sstevel@tonic-gate
1280Sstevel@tonic-gate void
remch(wchar_t c)1290Sstevel@tonic-gate remch(wchar_t c)
1300Sstevel@tonic-gate {
1310Sstevel@tonic-gate lchar lc = linearize(c);
132*6951Sab196087 size_t local_ncgidtbl;
1330Sstevel@tonic-gate
1340Sstevel@tonic-gate /*
1350Sstevel@tonic-gate * User-friendliness consideration:
1360Sstevel@tonic-gate * Make sure no EUC chars are used in reg. exp.
1370Sstevel@tonic-gate */
1380Sstevel@tonic-gate if (!handleeuc) {
1390Sstevel@tonic-gate if (!isascii(c))
1404538Sdamico if (iswprint(c))
1414538Sdamico warning(
1420Sstevel@tonic-gate "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c);
1434538Sdamico else warning(
1440Sstevel@tonic-gate "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c);
1450Sstevel@tonic-gate /* In any case, we don't need to construct ncgidtbl[]. */
1460Sstevel@tonic-gate return;
1470Sstevel@tonic-gate }
1480Sstevel@tonic-gate
149*6951Sab196087 /*
150*6951Sab196087 * lsearch wants ncgidtbl to be size_t, but it is int. Hence,
151*6951Sab196087 * the use of local_ncgidtbl to satisfy the calling interface.
152*6951Sab196087 */
153*6951Sab196087 local_ncgidtbl = ncgidtbl;
154*6951Sab196087 (void) lsearch(&lc, yycgidtbl,
155*6951Sab196087 &local_ncgidtbl, sizeof (lchar), cmplc);
156*6951Sab196087 ncgidtbl = (int)local_ncgidtbl;
1570Sstevel@tonic-gate }
1580Sstevel@tonic-gate
1590Sstevel@tonic-gate void
sortcgidtbl(void)1600Sstevel@tonic-gate sortcgidtbl(void)
1610Sstevel@tonic-gate {
1620Sstevel@tonic-gate if (!handleeuc)
1630Sstevel@tonic-gate return;
1640Sstevel@tonic-gate qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc);
1650Sstevel@tonic-gate }
1660Sstevel@tonic-gate
1670Sstevel@tonic-gate /*
1680Sstevel@tonic-gate * int yycgid(wchar_t c)
1690Sstevel@tonic-gate * Takes c and returns its character group id, determind by the
1700Sstevel@tonic-gate * following algorithm. The program also uses the binary search
1710Sstevel@tonic-gate * algorithm, generalized from Knuth (6.2.1) Algorithm B.
1720Sstevel@tonic-gate *
1730Sstevel@tonic-gate * This function computes the "character group id" based on
1740Sstevel@tonic-gate * a table yycgidtbl of which each lchar entry is pre-sorted
1750Sstevel@tonic-gate * in ascending sequence The number of valid entries is given
1760Sstevel@tonic-gate * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl.
1770Sstevel@tonic-gate * const int YYNCGIDTBL;
1780Sstevel@tonic-gate * lchar yycgidtbl[YYNCGIDTBL];
1790Sstevel@tonic-gate *
1800Sstevel@tonic-gate * yycgidtbl[0] is guaranteed to have zero.
1810Sstevel@tonic-gate *
1820Sstevel@tonic-gate * For given c, yycgid(c) returns:
1830Sstevel@tonic-gate * 2*i iff yycgidtbl[i] == lc
1840Sstevel@tonic-gate * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1]
1850Sstevel@tonic-gate * YYNCGIDTBL*2-1
1860Sstevel@tonic-gate * iff yycgidtbl[YYNCGIDTBL-1] < lc
1870Sstevel@tonic-gate * where lc=linearize(c).
1880Sstevel@tonic-gate *
1890Sstevel@tonic-gate * Some interesting properties.:
1900Sstevel@tonic-gate * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1
1910Sstevel@tonic-gate * 2. yycgid(c) == 0 iff c == 0.
1920Sstevel@tonic-gate * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then
1930Sstevel@tonic-gate * yycgid(c) <= yycgid(d).
1940Sstevel@tonic-gate * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then
1950Sstevel@tonic-gate * linearize(c) < linearize(d).
1960Sstevel@tonic-gate */
1970Sstevel@tonic-gate #define YYNCGIDTBL ncgidtbl
1980Sstevel@tonic-gate
1990Sstevel@tonic-gate int
yycgid(wchar_t c)2000Sstevel@tonic-gate yycgid(wchar_t c)
2010Sstevel@tonic-gate {
2020Sstevel@tonic-gate int first = 0;
2030Sstevel@tonic-gate int last = YYNCGIDTBL - 1;
2040Sstevel@tonic-gate lchar lc;
2050Sstevel@tonic-gate
2060Sstevel@tonic-gate /*
2070Sstevel@tonic-gate * In ASCII compat. mode, each character forms a "group" and the
2080Sstevel@tonic-gate * group-id is itself...
2090Sstevel@tonic-gate */
2100Sstevel@tonic-gate if (!handleeuc)
2110Sstevel@tonic-gate return (c);
2120Sstevel@tonic-gate
2130Sstevel@tonic-gate lc = linearize(c);
2140Sstevel@tonic-gate
2150Sstevel@tonic-gate /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */
2160Sstevel@tonic-gate if (yycgidtbl[YYNCGIDTBL - 1] < lc)
2170Sstevel@tonic-gate return (YYNCGIDTBL*2 - 1);
2180Sstevel@tonic-gate
2190Sstevel@tonic-gate while (last >= 0) {
2200Sstevel@tonic-gate int i = (first+last)/2;
2210Sstevel@tonic-gate if (lc == yycgidtbl[i])
2224538Sdamico return (2*i); /* lc exactly matches an element. */
2230Sstevel@tonic-gate else if (yycgidtbl[i] < lc) {
2244538Sdamico if (lc < yycgidtbl[i+1]) {
2254538Sdamico /* lc is in between two elements */
2264538Sdamico return (2*i+1);
2274538Sdamico }
2280Sstevel@tonic-gate else
2294538Sdamico first = i + 1;
2300Sstevel@tonic-gate } else
2314538Sdamico last = i - 1;
2320Sstevel@tonic-gate }
2330Sstevel@tonic-gate error(
2340Sstevel@tonic-gate "system error in yycgid():binary search failed for c=0x%04x\n", c);
2350Sstevel@tonic-gate return (0);
2360Sstevel@tonic-gate }
2370Sstevel@tonic-gate
2380Sstevel@tonic-gate /*
2390Sstevel@tonic-gate * repbycgid --- replaces each character in the parsing tree by its
2400Sstevel@tonic-gate * character group id. This, however, should be called even in
2410Sstevel@tonic-gate * the ASCII compat. mode to process DOT nodes and to call cclinter()
2420Sstevel@tonic-gate * for the DOT and CCL nodes.
2430Sstevel@tonic-gate */
2440Sstevel@tonic-gate void
repbycgid(void)2450Sstevel@tonic-gate repbycgid(void)
2460Sstevel@tonic-gate {
2470Sstevel@tonic-gate int i, c;
2480Sstevel@tonic-gate
2490Sstevel@tonic-gate for (i = 0; i < tptr; ++i) {
2500Sstevel@tonic-gate c = name[i];
2510Sstevel@tonic-gate if (!ISOPERATOR(c)) {
2520Sstevel@tonic-gate /* If not an operator, it must be a char. */
2534538Sdamico name[i] = yycgid((wchar_t)c); /* So replace it. */
2540Sstevel@tonic-gate #ifdef DEBUG
2554538Sdamico if (debug) {
2564538Sdamico printf("name[%d]:'%c'->%d;\n", i, c, name[i]);
2574538Sdamico }
2580Sstevel@tonic-gate #endif
2590Sstevel@tonic-gate } else if (c == RSTR) {
2600Sstevel@tonic-gate c = right[i];
2610Sstevel@tonic-gate right[i] = yycgid((wchar_t)c);
2620Sstevel@tonic-gate #ifdef DEBUG
2634538Sdamico if (debug) {
2644538Sdamico printf(
2654538Sdamico "name[%d].right:'%c'->%d;\n",
2664538Sdamico i, c, right[i]);
2674538Sdamico }
2680Sstevel@tonic-gate #endif
2690Sstevel@tonic-gate } else if ((c == RCCL) || (c == RNCCL)) {
2700Sstevel@tonic-gate CHR cc, *s;
2710Sstevel@tonic-gate int j;
2720Sstevel@tonic-gate CHR ccltoken[CCLSIZE];
2730Sstevel@tonic-gate CHR *ccp;
2740Sstevel@tonic-gate int m;
2750Sstevel@tonic-gate /*
2760Sstevel@tonic-gate * This node represetns a character class RE [ccccc]
2770Sstevel@tonic-gate * s points to the string of characters that forms
2780Sstevel@tonic-gate * the class and/or a special prefix notation
2790Sstevel@tonic-gate * <RANGE>XY which corresponds to the RE X-Y,
2800Sstevel@tonic-gate * characters in the range of X and Y. Here,
2810Sstevel@tonic-gate * X <= Y is guranteed.
2820Sstevel@tonic-gate * We transform these characters into a string
2830Sstevel@tonic-gate * of sorted character group ids.
2840Sstevel@tonic-gate *
2850Sstevel@tonic-gate * There is another mechanism of packing tables
2860Sstevel@tonic-gate * that is inherited from the ASCII lex. Call of
2870Sstevel@tonic-gate * cclinter() is required for this packing.
2880Sstevel@tonic-gate * This used to be done as yylex() reads the lex
2890Sstevel@tonic-gate * rules but we have to do this here because the
2900Sstevel@tonic-gate * transition table is made to work on the char-group
2910Sstevel@tonic-gate * ids and the mapping cannot be determined until
2920Sstevel@tonic-gate * the entire file is read.
2930Sstevel@tonic-gate */
2940Sstevel@tonic-gate #ifdef DEBUG
2950Sstevel@tonic-gate if (debug) {
2960Sstevel@tonic-gate printf("name[%d]:R[N]CCL of \"", i);
2970Sstevel@tonic-gate strpt(left[i]);
2980Sstevel@tonic-gate printf(" -> {");
2990Sstevel@tonic-gate }
3000Sstevel@tonic-gate #endif
3010Sstevel@tonic-gate /* Prepare symbol[] for cclinter(). */
3020Sstevel@tonic-gate for (j = 0; j < ncg; ++j)
3030Sstevel@tonic-gate symbol[j] = FALSE;
3040Sstevel@tonic-gate
3050Sstevel@tonic-gate s = (CHR *) left[i];
3060Sstevel@tonic-gate while (cc = *s++) {
3070Sstevel@tonic-gate if (cc == RANGE) {
3080Sstevel@tonic-gate int low, high, i;
3090Sstevel@tonic-gate /*
3100Sstevel@tonic-gate * Special form: <RANGE>XY
3110Sstevel@tonic-gate * This means the range X-Y.
3120Sstevel@tonic-gate * We mark all symbols[]
3130Sstevel@tonic-gate * elements for yycgid(X) thru
3140Sstevel@tonic-gate * yycgid(Y), inclusively.
3150Sstevel@tonic-gate */
3160Sstevel@tonic-gate low = yycgid(*s++);
3170Sstevel@tonic-gate high = yycgid(*s++);
3180Sstevel@tonic-gate for (i = low; i <= high; ++i)
3194538Sdamico setsymbol(i);
3200Sstevel@tonic-gate } else {
3210Sstevel@tonic-gate setsymbol(yycgid(cc));
3220Sstevel@tonic-gate }
3230Sstevel@tonic-gate }
3240Sstevel@tonic-gate
3250Sstevel@tonic-gate /* Now make a transformed string of cgids. */
3260Sstevel@tonic-gate s = ccptr;
3270Sstevel@tonic-gate m = 0;
3280Sstevel@tonic-gate for (j = 0; j < ncg; ++j)
3294538Sdamico if (symbol[j]) {
3304538Sdamico ccltoken[m++] = (CHR)j;
3310Sstevel@tonic-gate #ifdef DEBUG
3324538Sdamico if (debug) printf("%d, ", j);
3330Sstevel@tonic-gate #endif
3344538Sdamico }
3350Sstevel@tonic-gate
3360Sstevel@tonic-gate #ifdef DEBUG
3370Sstevel@tonic-gate if (debug) printf("}\n");
3380Sstevel@tonic-gate #endif
3390Sstevel@tonic-gate ccltoken[m] = 0;
3400Sstevel@tonic-gate ccp = ccl;
3410Sstevel@tonic-gate while (ccp < ccptr && scomp(ccltoken, ccp) != 0)
3420Sstevel@tonic-gate ccp++;
3430Sstevel@tonic-gate if (ccp < ccptr) { /* character class found in ccl */
3444538Sdamico left[i] = (int)ccp;
3450Sstevel@tonic-gate } else { /* not in ccl, add it */
3464538Sdamico left[i] = (int)ccptr;
3474538Sdamico scopy(ccltoken, ccptr);
3484538Sdamico ccptr += slength(ccltoken) + 1;
3494538Sdamico if (ccptr > ccl + CCLSIZE)
3504538Sdamico error(
3514538Sdamico "Too many large character classes");
3520Sstevel@tonic-gate }
3530Sstevel@tonic-gate cclinter(c == RCCL);
3540Sstevel@tonic-gate } else if (c == DOT) {
3550Sstevel@tonic-gate if (psave == 0) { /* First DOT node. */
3560Sstevel@tonic-gate int j, nlid;
3570Sstevel@tonic-gate /*
3580Sstevel@tonic-gate * Make symbol[k]=TRUE for all k
3590Sstevel@tonic-gate * except k == yycgid('\n').
3600Sstevel@tonic-gate */
3610Sstevel@tonic-gate nlid = yycgid('\n');
3620Sstevel@tonic-gate psave = ccptr;
3630Sstevel@tonic-gate for (j = 1; j < ncg; ++j) {
3640Sstevel@tonic-gate if (j == nlid) {
3650Sstevel@tonic-gate symbol[j] = FALSE;
3660Sstevel@tonic-gate } else {
3670Sstevel@tonic-gate symbol[j] = TRUE;
3680Sstevel@tonic-gate *ccptr++ = (CHR) j;
3690Sstevel@tonic-gate }
3700Sstevel@tonic-gate }
3710Sstevel@tonic-gate *ccptr++ = 0;
3720Sstevel@tonic-gate if (ccptr > ccl + CCLSIZE)
3734538Sdamico error(
3744538Sdamico "Too many large character classes");
3750Sstevel@tonic-gate }
3760Sstevel@tonic-gate /* Mimic mn1(RCCL,psave)... */
3770Sstevel@tonic-gate name[i] = RCCL;
3780Sstevel@tonic-gate left[i] = (int)psave;
3790Sstevel@tonic-gate cclinter(1);
3800Sstevel@tonic-gate }
3810Sstevel@tonic-gate }
3820Sstevel@tonic-gate #ifdef DEBUG
3830Sstevel@tonic-gate if (debug) {
3840Sstevel@tonic-gate printf("treedump after repbycgid().\n");
3850Sstevel@tonic-gate treedump();
3860Sstevel@tonic-gate }
3870Sstevel@tonic-gate #endif
3880Sstevel@tonic-gate }
3890Sstevel@tonic-gate
3900Sstevel@tonic-gate static void
setsymbol(int i)3910Sstevel@tonic-gate setsymbol(int i)
3920Sstevel@tonic-gate {
3930Sstevel@tonic-gate if (i > sizeof (symbol))
3940Sstevel@tonic-gate error("setsymbol: (SYSERR) %d out of range", i);
3950Sstevel@tonic-gate symbol[i] = TRUE;
3960Sstevel@tonic-gate }
397