1*8d25f761Sleot /* $NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $ */
2a1228f55Sjtc
39acecc03Sglass /*-
4a1228f55Sjtc * Copyright (c) 1991, 1993
5a1228f55Sjtc * The Regents of the University of California. All rights reserved.
69acecc03Sglass *
79acecc03Sglass * Redistribution and use in source and binary forms, with or without
89acecc03Sglass * modification, are permitted provided that the following conditions
99acecc03Sglass * are met:
109acecc03Sglass * 1. Redistributions of source code must retain the above copyright
119acecc03Sglass * notice, this list of conditions and the following disclaimer.
129acecc03Sglass * 2. Redistributions in binary form must reproduce the above copyright
139acecc03Sglass * notice, this list of conditions and the following disclaimer in the
149acecc03Sglass * documentation and/or other materials provided with the distribution.
1589aaa1bbSagc * 3. Neither the name of the University nor the names of its contributors
169acecc03Sglass * may be used to endorse or promote products derived from this software
179acecc03Sglass * without specific prior written permission.
189acecc03Sglass *
199acecc03Sglass * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
209acecc03Sglass * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
219acecc03Sglass * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
229acecc03Sglass * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
239acecc03Sglass * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
249acecc03Sglass * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
259acecc03Sglass * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
269acecc03Sglass * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
279acecc03Sglass * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
289acecc03Sglass * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
299acecc03Sglass * SUCH DAMAGE.
309acecc03Sglass */
319acecc03Sglass
327e4d6ea1Slukem #include <sys/cdefs.h>
339acecc03Sglass #ifndef lint
34a1228f55Sjtc #if 0
35f1df59adSjtc static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
36a1228f55Sjtc #endif
37*8d25f761Sleot __RCSID("$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $");
389acecc03Sglass #endif /* not lint */
399acecc03Sglass
409acecc03Sglass #include <sys/types.h>
419acecc03Sglass
427e4d6ea1Slukem #include <err.h>
439acecc03Sglass #include <errno.h>
449acecc03Sglass #include <stddef.h>
459acecc03Sglass #include <stdio.h>
469acecc03Sglass #include <stdlib.h>
479acecc03Sglass #include <string.h>
48be9cb223Sjtc #include <ctype.h>
49d05d2aa5Sdholland #include <assert.h>
509acecc03Sglass
519acecc03Sglass #include "extern.h"
529acecc03Sglass
537f7f5495Sdholland struct str {
547f7f5495Sdholland enum { STRING1, STRING2 } which;
557f7f5495Sdholland enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
567f7f5495Sdholland int cnt; /* character count */
577f7f5495Sdholland int lastch; /* last character */
587f7f5495Sdholland int equiv[2]; /* equivalence set */
597f7f5495Sdholland int *set; /* set of characters */
609f2ae0eaSdholland const char *str; /* user's string */
617f7f5495Sdholland };
627f7f5495Sdholland
631d578019Sjoerg static int backslash(STR *);
641d578019Sjoerg static int bracket(STR *);
651d578019Sjoerg static int c_class(const void *, const void *);
66aba78f75Sdholland static int *genclass(const char *, size_t);
671d578019Sjoerg static void genequiv(STR *);
681d578019Sjoerg static int genrange(STR *);
691d578019Sjoerg static void genseq(STR *);
709acecc03Sglass
717f7f5495Sdholland STR *
str_create(int whichstring,const char * txt)728b42fe33Sdholland str_create(int whichstring, const char *txt)
737f7f5495Sdholland {
747f7f5495Sdholland STR *s;
757f7f5495Sdholland
767f7f5495Sdholland s = malloc(sizeof(*s));
777f7f5495Sdholland if (s == NULL) {
787f7f5495Sdholland err(1, "Out of memory");
797f7f5495Sdholland }
807f7f5495Sdholland
817f7f5495Sdholland s->which = whichstring == 2 ? STRING2 : STRING1;
827f7f5495Sdholland s->state = NORMAL;
837f7f5495Sdholland s->cnt = 0;
847f7f5495Sdholland s->lastch = OOBCH;
857f7f5495Sdholland s->equiv[0] = 0;
867f7f5495Sdholland s->equiv[1] = OOBCH;
877f7f5495Sdholland s->set = NULL;
888b42fe33Sdholland s->str = txt;
897f7f5495Sdholland
907f7f5495Sdholland return s;
917f7f5495Sdholland }
927f7f5495Sdholland
937f7f5495Sdholland void
str_destroy(STR * s)947f7f5495Sdholland str_destroy(STR *s)
957f7f5495Sdholland {
967f7f5495Sdholland if (s->set != NULL && s->set != s->equiv) {
977f7f5495Sdholland free(s->set);
987f7f5495Sdholland }
997f7f5495Sdholland free(s);
1007f7f5495Sdholland }
1017f7f5495Sdholland
1029acecc03Sglass int
next(STR * s,int * ret)103d05d2aa5Sdholland next(STR *s, int *ret)
1049acecc03Sglass {
1057e4d6ea1Slukem int ch;
1069acecc03Sglass
1079acecc03Sglass switch (s->state) {
1089acecc03Sglass case EOS:
109d05d2aa5Sdholland *ret = s->lastch;
1100ffd30a0Schristos return 0;
1119acecc03Sglass case INFINITE:
112d05d2aa5Sdholland *ret = s->lastch;
1130ffd30a0Schristos return 1;
1149acecc03Sglass case NORMAL:
1155e8eff37Sdholland ch = (unsigned char)s->str[0];
1165e8eff37Sdholland switch (ch) {
1179acecc03Sglass case '\0':
1189acecc03Sglass s->state = EOS;
119d05d2aa5Sdholland *ret = s->lastch;
1200ffd30a0Schristos return 0;
1219acecc03Sglass case '\\':
1229acecc03Sglass s->lastch = backslash(s);
1239acecc03Sglass break;
1249acecc03Sglass case '[':
1255e8eff37Sdholland if (bracket(s)) {
126d05d2aa5Sdholland return next(s, ret);
1275e8eff37Sdholland }
1289acecc03Sglass /* FALLTHROUGH */
1299acecc03Sglass default:
1309acecc03Sglass ++s->str;
1319acecc03Sglass s->lastch = ch;
1329acecc03Sglass break;
1339acecc03Sglass }
1349acecc03Sglass
1359acecc03Sglass /* We can start a range at any time. */
136d05d2aa5Sdholland if (s->str[0] == '-' && genrange(s)) {
137d05d2aa5Sdholland return next(s, ret);
138d05d2aa5Sdholland }
139d05d2aa5Sdholland *ret = s->lastch;
1400ffd30a0Schristos return 1;
1419acecc03Sglass case RANGE:
1425e8eff37Sdholland if (s->cnt == 0) {
1439acecc03Sglass s->state = NORMAL;
144d05d2aa5Sdholland return next(s, ret);
1459acecc03Sglass }
1465e8eff37Sdholland s->cnt--;
1479acecc03Sglass ++s->lastch;
148d05d2aa5Sdholland *ret = s->lastch;
1490ffd30a0Schristos return 1;
1509acecc03Sglass case SEQUENCE:
1515e8eff37Sdholland if (s->cnt == 0) {
1529acecc03Sglass s->state = NORMAL;
153d05d2aa5Sdholland return next(s, ret);
1549acecc03Sglass }
1555e8eff37Sdholland s->cnt--;
156d05d2aa5Sdholland *ret = s->lastch;
1570ffd30a0Schristos return 1;
1589acecc03Sglass case SET:
1595e8eff37Sdholland s->lastch = s->set[s->cnt++];
1605e8eff37Sdholland if (s->lastch == OOBCH) {
1619acecc03Sglass s->state = NORMAL;
1625e8eff37Sdholland if (s->set != s->equiv) {
1635e8eff37Sdholland free(s->set);
1645e8eff37Sdholland }
1655e8eff37Sdholland s->set = NULL;
166d05d2aa5Sdholland return next(s, ret);
1679acecc03Sglass }
168d05d2aa5Sdholland *ret = s->lastch;
1690ffd30a0Schristos return 1;
1709acecc03Sglass }
1719acecc03Sglass /* NOTREACHED */
172d05d2aa5Sdholland assert(0);
173d05d2aa5Sdholland *ret = s->lastch;
1740ffd30a0Schristos return 0;
1759acecc03Sglass }
1769acecc03Sglass
1779acecc03Sglass static int
bracket(STR * s)1781d578019Sjoerg bracket(STR *s)
1799acecc03Sglass {
1805e8eff37Sdholland const char *p;
181aba78f75Sdholland int *q;
1829acecc03Sglass
1839acecc03Sglass switch (s->str[1]) {
1849acecc03Sglass case ':': /* "[:class:]" */
1859acecc03Sglass if ((p = strstr(s->str + 2, ":]")) == NULL)
1860ffd30a0Schristos return 0;
1879acecc03Sglass s->str += 2;
188aba78f75Sdholland q = genclass(s->str, p - s->str);
189aba78f75Sdholland s->state = SET;
190aba78f75Sdholland s->set = q;
191aba78f75Sdholland s->cnt = 0;
1929acecc03Sglass s->str = p + 2;
1930ffd30a0Schristos return 1;
1949acecc03Sglass case '=': /* "[=equiv=]" */
1959acecc03Sglass if ((p = strstr(s->str + 2, "=]")) == NULL)
1960ffd30a0Schristos return 0;
1979acecc03Sglass s->str += 2;
1989acecc03Sglass genequiv(s);
1998d0b47d1Sdholland s->str = p + 2;
2000ffd30a0Schristos return 1;
2019acecc03Sglass default: /* "[\###*n]" or "[#*n]" */
2029acecc03Sglass if ((p = strpbrk(s->str + 2, "*]")) == NULL)
2030ffd30a0Schristos return 0;
2047e4d6ea1Slukem if (p[0] != '*' || strchr(p, ']') == NULL)
2050ffd30a0Schristos return 0;
2069acecc03Sglass s->str += 1;
2079acecc03Sglass genseq(s);
2080ffd30a0Schristos return 1;
2099acecc03Sglass }
2109acecc03Sglass /* NOTREACHED */
2119acecc03Sglass }
2129acecc03Sglass
2139acecc03Sglass typedef struct {
2147acf67f4Slukem const char *name;
2151d578019Sjoerg int (*func)(int);
2169acecc03Sglass } CLASS;
2179acecc03Sglass
2180ffd30a0Schristos static const CLASS classes[] = {
2190ffd30a0Schristos { "alnum", isalnum },
2200ffd30a0Schristos { "alpha", isalpha },
2210ffd30a0Schristos { "blank", isblank },
2220ffd30a0Schristos { "cntrl", iscntrl },
2230ffd30a0Schristos { "digit", isdigit },
2240ffd30a0Schristos { "graph", isgraph },
2250ffd30a0Schristos { "lower", islower },
2260ffd30a0Schristos { "print", isprint },
2270ffd30a0Schristos { "punct", ispunct },
2280ffd30a0Schristos { "space", isspace },
2290ffd30a0Schristos { "upper", isupper },
2300ffd30a0Schristos { "xdigit", isxdigit },
2319acecc03Sglass };
2329acecc03Sglass
2335e8eff37Sdholland typedef struct {
2345e8eff37Sdholland const char *name;
2355e8eff37Sdholland size_t len;
2365e8eff37Sdholland } CLASSKEY;
2375e8eff37Sdholland
238aba78f75Sdholland static int *
genclass(const char * class,size_t len)239aba78f75Sdholland genclass(const char *class, size_t len)
2409acecc03Sglass {
2415e8eff37Sdholland int ch;
2420ffd30a0Schristos const CLASS *cp;
2435e8eff37Sdholland CLASSKEY key;
2449acecc03Sglass int *p;
2455e8eff37Sdholland unsigned pos, num;
2469acecc03Sglass
2475e8eff37Sdholland /* Find the class */
2485e8eff37Sdholland key.name = class;
2495e8eff37Sdholland key.len = len;
2505e8eff37Sdholland cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
2515e8eff37Sdholland c_class);
2525e8eff37Sdholland if (cp == NULL) {
2535e8eff37Sdholland errx(1, "unknown class %.*s", (int)len, class);
2545e8eff37Sdholland }
2559acecc03Sglass
2565e8eff37Sdholland /*
2575e8eff37Sdholland * Figure out what characters are in the class
2585e8eff37Sdholland */
2595e8eff37Sdholland
2605e8eff37Sdholland num = NCHARS + 1;
2615e8eff37Sdholland p = malloc(num * sizeof(*p));
2625e8eff37Sdholland if (p == NULL) {
2637e4d6ea1Slukem err(1, "malloc");
2645e8eff37Sdholland }
2656438f483Schristos
2665e8eff37Sdholland pos = 0;
2675e8eff37Sdholland for (ch = 0; ch < NCHARS; ch++) {
2685e8eff37Sdholland if (cp->func(ch)) {
2695e8eff37Sdholland p[pos++] = ch;
2705e8eff37Sdholland }
2715e8eff37Sdholland }
2729acecc03Sglass
2735e8eff37Sdholland p[pos++] = OOBCH;
2745e8eff37Sdholland for (; pos < num; pos++) {
2755e8eff37Sdholland p[pos] = 0;
2765e8eff37Sdholland }
2775e8eff37Sdholland
278aba78f75Sdholland return p;
2799acecc03Sglass }
2809acecc03Sglass
2819acecc03Sglass static int
c_class(const void * av,const void * bv)2825e8eff37Sdholland c_class(const void *av, const void *bv)
2839acecc03Sglass {
2845e8eff37Sdholland const CLASSKEY *a = av;
2855e8eff37Sdholland const CLASS *b = bv;
2865e8eff37Sdholland size_t blen;
2875e8eff37Sdholland int r;
2885e8eff37Sdholland
2895e8eff37Sdholland blen = strlen(b->name);
2905e8eff37Sdholland r = strncmp(a->name, b->name, a->len);
2915e8eff37Sdholland if (r != 0) {
2925e8eff37Sdholland return r;
2935e8eff37Sdholland }
2945e8eff37Sdholland if (a->len < blen) {
2955e8eff37Sdholland /* someone gave us a prefix of the right name */
2965e8eff37Sdholland return -1;
2975e8eff37Sdholland }
2985e8eff37Sdholland assert(a-> len == blen);
2995e8eff37Sdholland return 0;
3009acecc03Sglass }
3019acecc03Sglass
3029acecc03Sglass /*
3039acecc03Sglass * English doesn't have any equivalence classes, so for now
3049acecc03Sglass * we just syntax check and grab the character.
3059acecc03Sglass */
3069acecc03Sglass static void
genequiv(STR * s)3071d578019Sjoerg genequiv(STR *s)
3089acecc03Sglass {
309aba78f75Sdholland int ch;
310aba78f75Sdholland
311aba78f75Sdholland ch = (unsigned char)s->str[0];
312aba78f75Sdholland if (ch == '\\') {
3139acecc03Sglass s->equiv[0] = backslash(s);
3149acecc03Sglass } else {
315aba78f75Sdholland s->equiv[0] = ch;
3168d0b47d1Sdholland s->str++;
3178d0b47d1Sdholland }
3188d0b47d1Sdholland if (s->str[0] != '=') {
3195e8eff37Sdholland errx(1, "Misplaced equivalence equals sign");
3205e8eff37Sdholland }
3218d0b47d1Sdholland s->str++;
3228d0b47d1Sdholland if (s->str[0] != ']') {
3238d0b47d1Sdholland errx(1, "Misplaced equivalence right bracket");
3249acecc03Sglass }
3258d0b47d1Sdholland s->str++;
3268d0b47d1Sdholland
3279acecc03Sglass s->cnt = 0;
3289acecc03Sglass s->state = SET;
3299acecc03Sglass s->set = s->equiv;
3309acecc03Sglass }
3319acecc03Sglass
3329acecc03Sglass static int
genrange(STR * s)3331d578019Sjoerg genrange(STR *s)
3349acecc03Sglass {
3359acecc03Sglass int stopval;
33648dbbc10Sdholland const char *savestart;
3379acecc03Sglass
338d506ddf6Sdholland savestart = s->str++;
3395e8eff37Sdholland stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
3405e8eff37Sdholland if (stopval < (unsigned char)s->lastch) {
3419acecc03Sglass s->str = savestart;
3420ffd30a0Schristos return 0;
3439acecc03Sglass }
3449acecc03Sglass s->cnt = stopval - s->lastch + 1;
3459acecc03Sglass s->state = RANGE;
3469acecc03Sglass --s->lastch;
3470ffd30a0Schristos return 1;
3489acecc03Sglass }
3499acecc03Sglass
3509acecc03Sglass static void
genseq(STR * s)3511d578019Sjoerg genseq(STR *s)
3529acecc03Sglass {
3539acecc03Sglass char *ep;
3549acecc03Sglass
3555e8eff37Sdholland if (s->which == STRING1) {
3565e8eff37Sdholland errx(1, "Sequences only valid in string2");
3575e8eff37Sdholland }
3589acecc03Sglass
3595e8eff37Sdholland if (*s->str == '\\') {
3609acecc03Sglass s->lastch = backslash(s);
3615e8eff37Sdholland } else {
3629f2ae0eaSdholland s->lastch = (unsigned char)*s->str++;
3635e8eff37Sdholland }
3645e8eff37Sdholland if (*s->str != '*') {
3655e8eff37Sdholland errx(1, "Misplaced sequence asterisk");
3665e8eff37Sdholland }
3679acecc03Sglass
3685e8eff37Sdholland s->str++;
3695e8eff37Sdholland switch (s->str[0]) {
3709acecc03Sglass case '\\':
3719acecc03Sglass s->cnt = backslash(s);
3729acecc03Sglass break;
3739acecc03Sglass case ']':
3749acecc03Sglass s->cnt = 0;
3759acecc03Sglass ++s->str;
3769acecc03Sglass break;
3779acecc03Sglass default:
3785e8eff37Sdholland if (isdigit((unsigned char)s->str[0])) {
3799acecc03Sglass s->cnt = strtol(s->str, &ep, 0);
3809acecc03Sglass if (*ep == ']') {
3819acecc03Sglass s->str = ep + 1;
3829acecc03Sglass break;
3839acecc03Sglass }
3849acecc03Sglass }
3857e4d6ea1Slukem errx(1, "illegal sequence count");
3869acecc03Sglass /* NOTREACHED */
3879acecc03Sglass }
3889acecc03Sglass
3899acecc03Sglass s->state = s->cnt ? SEQUENCE : INFINITE;
3909acecc03Sglass }
3919acecc03Sglass
3929acecc03Sglass /*
3939acecc03Sglass * Translate \??? into a character. Up to 3 octal digits, if no digits either
3949acecc03Sglass * an escape code or a literal character.
3959acecc03Sglass */
3969acecc03Sglass static int
backslash(STR * s)3971d578019Sjoerg backslash(STR *s)
3989acecc03Sglass {
3997e4d6ea1Slukem int ch, cnt, val;
4009acecc03Sglass
401aba78f75Sdholland cnt = val = 0;
402aba78f75Sdholland for (;;) {
403aba78f75Sdholland /* Consume the character we're already on. */
4045e8eff37Sdholland s->str++;
405aba78f75Sdholland
406aba78f75Sdholland /* Look at the next character. */
4075e8eff37Sdholland ch = (unsigned char)s->str[0];
4085e8eff37Sdholland if (!isascii(ch) || !isdigit(ch)) {
4099acecc03Sglass break;
4105e8eff37Sdholland }
4119acecc03Sglass val = val * 8 + ch - '0';
4129acecc03Sglass if (++cnt == 3) {
413aba78f75Sdholland /* Enough digits; consume this one and stop */
4149acecc03Sglass ++s->str;
4159acecc03Sglass break;
4169acecc03Sglass }
4179acecc03Sglass }
4185e8eff37Sdholland if (cnt) {
419aba78f75Sdholland /* We saw digits, so return their value */
420*8d25f761Sleot if (val >= OOBCH)
421*8d25f761Sleot errx(1, "Invalid octal character value");
4220ffd30a0Schristos return val;
4235e8eff37Sdholland }
424aba78f75Sdholland if (ch == '\0') {
425aba78f75Sdholland /* \<end> -> \ */
426aba78f75Sdholland s->state = EOS;
427aba78f75Sdholland return '\\';
4285e8eff37Sdholland }
429aba78f75Sdholland
430aba78f75Sdholland /* Consume the escaped character */
431aba78f75Sdholland s->str++;
432aba78f75Sdholland
4339acecc03Sglass switch (ch) {
4349acecc03Sglass case 'a': /* escape characters */
4350ffd30a0Schristos return '\7';
4369acecc03Sglass case 'b':
4370ffd30a0Schristos return '\b';
4380ffd30a0Schristos case 'e':
4390ffd30a0Schristos return '\033';
4409acecc03Sglass case 'f':
4410ffd30a0Schristos return '\f';
4429acecc03Sglass case 'n':
4430ffd30a0Schristos return '\n';
4449acecc03Sglass case 'r':
4450ffd30a0Schristos return '\r';
4469acecc03Sglass case 't':
4470ffd30a0Schristos return '\t';
4489acecc03Sglass case 'v':
4490ffd30a0Schristos return '\13';
450aba78f75Sdholland default: /* \q -> q */
4510ffd30a0Schristos return ch;
4529acecc03Sglass }
4539acecc03Sglass }
454