1*51396Sbostic /*- 2*51396Sbostic * Copyright (c) 1991 The Regents of the University of California. 3*51396Sbostic * All rights reserved. 4*51396Sbostic * 5*51396Sbostic * %sccs.include.redist.c% 6*51396Sbostic */ 7*51396Sbostic 8*51396Sbostic #ifndef lint 9*51396Sbostic static char sccsid[] = "@(#)str.c 5.1 (Berkeley) 10/24/91"; 10*51396Sbostic #endif /* not lint */ 11*51396Sbostic 12*51396Sbostic #include <sys/cdefs.h> 13*51396Sbostic #include <sys/types.h> 14*51396Sbostic #include <errno.h> 15*51396Sbostic #include <stdio.h> 16*51396Sbostic #include <stddef.h> 17*51396Sbostic #include <stdlib.h> 18*51396Sbostic #include <string.h> 19*51396Sbostic #include "extern.h" 20*51396Sbostic 21*51396Sbostic static int backslash __P((STR *)); 22*51396Sbostic static int bracket __P((STR *)); 23*51396Sbostic static int c_class __P((const void *, const void *)); 24*51396Sbostic static void genclass __P((STR *)); 25*51396Sbostic static void genequiv __P((STR *)); 26*51396Sbostic static int genrange __P((STR *)); 27*51396Sbostic static void genseq __P((STR *)); 28*51396Sbostic 29*51396Sbostic int 30*51396Sbostic next(s) 31*51396Sbostic register STR *s; 32*51396Sbostic { 33*51396Sbostic register int ch; 34*51396Sbostic 35*51396Sbostic switch (s->state) { 36*51396Sbostic case EOS: 37*51396Sbostic return (0); 38*51396Sbostic case INFINITE: 39*51396Sbostic return (1); 40*51396Sbostic case NORMAL: 41*51396Sbostic switch (ch = *s->str++) { 42*51396Sbostic case '\0': 43*51396Sbostic --s->str; 44*51396Sbostic s->state = EOS; 45*51396Sbostic return (0); 46*51396Sbostic case '\\': 47*51396Sbostic s->lastch = backslash(s); 48*51396Sbostic break; 49*51396Sbostic case '[': 50*51396Sbostic if (bracket(s)) 51*51396Sbostic return (next(s)); 52*51396Sbostic /* FALLTHROUGH */ 53*51396Sbostic default: 54*51396Sbostic s->lastch = ch; 55*51396Sbostic break; 56*51396Sbostic } 57*51396Sbostic 58*51396Sbostic /* We can start a range at any time. */ 59*51396Sbostic if (s->str[0] == '-' && genrange(s)) 60*51396Sbostic return (next(s)); 61*51396Sbostic return (1); 62*51396Sbostic case RANGE: 63*51396Sbostic if (s->cnt-- == 0) { 64*51396Sbostic s->state = NORMAL; 65*51396Sbostic return (next(s)); 66*51396Sbostic } 67*51396Sbostic ++s->lastch; 68*51396Sbostic return (1); 69*51396Sbostic case SEQUENCE: 70*51396Sbostic if (s->cnt-- == 0) { 71*51396Sbostic s->state = NORMAL; 72*51396Sbostic return (next(s)); 73*51396Sbostic } 74*51396Sbostic return (1); 75*51396Sbostic case SET: 76*51396Sbostic case ULSET: 77*51396Sbostic if ((s->lastch = s->set[s->cnt++]) == OOBCH) { 78*51396Sbostic s->state = NORMAL; 79*51396Sbostic return (next(s)); 80*51396Sbostic } 81*51396Sbostic return (1); 82*51396Sbostic } 83*51396Sbostic /* NOTREACHED */ 84*51396Sbostic } 85*51396Sbostic 86*51396Sbostic static int 87*51396Sbostic bracket(s) 88*51396Sbostic register STR *s; 89*51396Sbostic { 90*51396Sbostic register char *p; 91*51396Sbostic 92*51396Sbostic switch (*s->str) { 93*51396Sbostic case ':': /* "[:class:]" */ 94*51396Sbostic if ((p = strpbrk(s->str + 1, ":]")) == NULL) 95*51396Sbostic return (0); 96*51396Sbostic if (p[0] != ':' || p[1] != ']') 97*51396Sbostic return (0); 98*51396Sbostic *p = '\0'; 99*51396Sbostic ++s->str; 100*51396Sbostic genclass(s); 101*51396Sbostic s->str = p + 2; 102*51396Sbostic return (1); 103*51396Sbostic case '=': /* "[=equiv=]" */ 104*51396Sbostic if ((p = strpbrk(s->str + 1, "=]")) == NULL) 105*51396Sbostic return (0); 106*51396Sbostic if (p[0] != '=' || p[1] != ']') 107*51396Sbostic return (0); 108*51396Sbostic genequiv(s); 109*51396Sbostic return (1); 110*51396Sbostic default: /* "[\###*]" or "[#*]" */ 111*51396Sbostic if ((p = strpbrk(s->str + 1, "*]")) == NULL) 112*51396Sbostic return (0); 113*51396Sbostic if (p[0] != '*' || index(p, ']') == NULL) 114*51396Sbostic return (0); 115*51396Sbostic genseq(s); 116*51396Sbostic return (1); 117*51396Sbostic } 118*51396Sbostic /* NOTREACHED */ 119*51396Sbostic } 120*51396Sbostic 121*51396Sbostic int isalnum __P((int)), 122*51396Sbostic isalpha __P((int)), 123*51396Sbostic isblank __P((int)), 124*51396Sbostic isspace __P((int)), 125*51396Sbostic iscntrl __P((int)), 126*51396Sbostic isdigit __P((int)), 127*51396Sbostic isgraph __P((int)), 128*51396Sbostic islower __P((int)), 129*51396Sbostic isprint __P((int)), 130*51396Sbostic ispunct __P((int)), 131*51396Sbostic isupper __P((int)), 132*51396Sbostic isxdigit __P((int)); 133*51396Sbostic 134*51396Sbostic typedef struct { 135*51396Sbostic char *name; 136*51396Sbostic int (*func) __P((int)); 137*51396Sbostic u_int type; 138*51396Sbostic int *set; 139*51396Sbostic } CLASS; 140*51396Sbostic 141*51396Sbostic static CLASS classes[] = { 142*51396Sbostic { "alnum", isalnum, T_CLASS, }, 143*51396Sbostic { "alpha", isalpha, T_CLASS, }, 144*51396Sbostic { "blank", isblank, T_CLASS, }, 145*51396Sbostic { "cntrl", iscntrl, T_CLASS, }, 146*51396Sbostic { "digit", isdigit, T_CLASS, }, 147*51396Sbostic { "graph", isgraph, T_CLASS, }, 148*51396Sbostic { "lower", islower, T_UL, }, 149*51396Sbostic { "print", isupper, T_CLASS, }, 150*51396Sbostic { "punct", ispunct, T_CLASS, }, 151*51396Sbostic { "space", isspace, T_CLASS, }, 152*51396Sbostic { "upper", isupper, T_UL, }, 153*51396Sbostic { "xdigit", isxdigit, T_CLASS, }, 154*51396Sbostic }; 155*51396Sbostic 156*51396Sbostic static void 157*51396Sbostic genclass(s) 158*51396Sbostic STR *s; 159*51396Sbostic { 160*51396Sbostic register int cnt, (*func) __P((int)); 161*51396Sbostic CLASS *cp, tmp; 162*51396Sbostic int *p; 163*51396Sbostic 164*51396Sbostic tmp.name = s->str; 165*51396Sbostic if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / 166*51396Sbostic sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) 167*51396Sbostic err("unknown class %s", s->str); 168*51396Sbostic if (!(cp->type | s->type)) 169*51396Sbostic err("class %s illegally used"); 170*51396Sbostic 171*51396Sbostic if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 172*51396Sbostic err("%s", strerror(errno)); 173*51396Sbostic bzero(p, NCHARS); 174*51396Sbostic for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt) 175*51396Sbostic if ((func)(cnt)) 176*51396Sbostic *p++ = cnt; 177*51396Sbostic *p = OOBCH; 178*51396Sbostic 179*51396Sbostic s->cnt = 0; 180*51396Sbostic s->state = cp->type & T_UL ? ULSET : SET; 181*51396Sbostic s->set = cp->set; 182*51396Sbostic } 183*51396Sbostic 184*51396Sbostic static int 185*51396Sbostic c_class(a, b) 186*51396Sbostic const void *a, *b; 187*51396Sbostic { 188*51396Sbostic return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name)); 189*51396Sbostic } 190*51396Sbostic 191*51396Sbostic /* 192*51396Sbostic * English doesn't have any equivalence classes, so for now 193*51396Sbostic * we just syntax check and grab the character. 194*51396Sbostic */ 195*51396Sbostic static void 196*51396Sbostic genequiv(s) 197*51396Sbostic STR *s; 198*51396Sbostic { 199*51396Sbostic static int val[2] = { 0, OOBCH }; 200*51396Sbostic 201*51396Sbostic if (*++s->str == '\\') { 202*51396Sbostic val[0] = backslash(s); 203*51396Sbostic if (*s->str != '=') 204*51396Sbostic err("misplaced equivalence equals sign"); 205*51396Sbostic } else { 206*51396Sbostic val[0] = s->str[0]; 207*51396Sbostic if (s->str[1] != '=') 208*51396Sbostic err("misplaced equivalence equals sign"); 209*51396Sbostic } 210*51396Sbostic s->str += 2; 211*51396Sbostic s->cnt = 0; 212*51396Sbostic s->state = SET; 213*51396Sbostic s->set = val; 214*51396Sbostic } 215*51396Sbostic 216*51396Sbostic static int 217*51396Sbostic genrange(s) 218*51396Sbostic STR *s; 219*51396Sbostic { 220*51396Sbostic int stopval; 221*51396Sbostic char *savestart; 222*51396Sbostic 223*51396Sbostic savestart = s->str; 224*51396Sbostic stopval = *++s->str == '\\' ? backslash(s) : *s->str; 225*51396Sbostic if (stopval < s->lastch) { 226*51396Sbostic s->str = savestart; 227*51396Sbostic return (0); 228*51396Sbostic } 229*51396Sbostic s->cnt = stopval - s->lastch + 1; 230*51396Sbostic s->state = RANGE; 231*51396Sbostic --s->lastch; 232*51396Sbostic return (1); 233*51396Sbostic } 234*51396Sbostic 235*51396Sbostic static void 236*51396Sbostic genseq(s) 237*51396Sbostic STR *s; 238*51396Sbostic { 239*51396Sbostic char *ep; 240*51396Sbostic 241*51396Sbostic if (!(s->type & T_SEQ)) 242*51396Sbostic err("sequences only valid in string1"); 243*51396Sbostic 244*51396Sbostic if (*s->str == '\\') 245*51396Sbostic s->lastch = backslash(s); 246*51396Sbostic else 247*51396Sbostic s->lastch = *s->str++; 248*51396Sbostic if (*s->str != '*') 249*51396Sbostic err("misplaced sequence asterisk"); 250*51396Sbostic 251*51396Sbostic switch (*++s->str) { 252*51396Sbostic case '\\': 253*51396Sbostic s->cnt = backslash(s); 254*51396Sbostic break; 255*51396Sbostic case ']': 256*51396Sbostic s->cnt = 0; 257*51396Sbostic ++s->str; 258*51396Sbostic break; 259*51396Sbostic default: 260*51396Sbostic if (isdigit(*s->str)) { 261*51396Sbostic s->cnt = strtol(s->str, &ep, 0); 262*51396Sbostic if (*ep == ']') { 263*51396Sbostic s->str = ep + 1; 264*51396Sbostic break; 265*51396Sbostic } 266*51396Sbostic } 267*51396Sbostic err("illegal sequence count"); 268*51396Sbostic /* NOTREACHED */ 269*51396Sbostic } 270*51396Sbostic 271*51396Sbostic s->state = s->cnt ? SEQUENCE : INFINITE; 272*51396Sbostic } 273*51396Sbostic 274*51396Sbostic /* Use the #defines here, DON'T use them above. */ 275*51396Sbostic #include <ctype.h> 276*51396Sbostic 277*51396Sbostic /* 278*51396Sbostic * Translate \??? into a character. Up to 3 octal digits, if no digits either 279*51396Sbostic * an escape code or a literal character. 280*51396Sbostic */ 281*51396Sbostic static int 282*51396Sbostic backslash(s) 283*51396Sbostic register STR *s; 284*51396Sbostic { 285*51396Sbostic register int ch, cnt, val; 286*51396Sbostic 287*51396Sbostic for (cnt = val = 0;;) { 288*51396Sbostic ch = *++s->str; 289*51396Sbostic if (!isascii(ch) || !isdigit(ch)) 290*51396Sbostic break; 291*51396Sbostic val = val * 8 + ch - '0'; 292*51396Sbostic if (++cnt == 3) 293*51396Sbostic break; 294*51396Sbostic } 295*51396Sbostic if (cnt) 296*51396Sbostic return (val); 297*51396Sbostic ++s->str; 298*51396Sbostic switch (ch) { 299*51396Sbostic case 'a': /* escape characters */ 300*51396Sbostic return ('\7'); 301*51396Sbostic case 'b': 302*51396Sbostic return ('\b'); 303*51396Sbostic case 'f': 304*51396Sbostic return ('\f'); 305*51396Sbostic case 'n': 306*51396Sbostic return ('\n'); 307*51396Sbostic case 'r': 308*51396Sbostic return ('\r'); 309*51396Sbostic case 't': 310*51396Sbostic return ('\t'); 311*51396Sbostic case 'v': 312*51396Sbostic return ('\13'); 313*51396Sbostic case '\0': /* \" -> \ */ 314*51396Sbostic s->state = EOS; 315*51396Sbostic return ('\\'); 316*51396Sbostic default: /* \x" -> x */ 317*51396Sbostic return (ch); 318*51396Sbostic } 319*51396Sbostic } 320