xref: /openbsd-src/usr.bin/rsync/rmatch.c (revision 936f2faf619cb018a3e4493785ed1ad166ca15d8)
1*936f2fafSclaudio /*	$OpenBSD: rmatch.c,v 1.4 2024/02/20 10:36:23 claudio Exp $	*/
257987d16Sclaudio 
357987d16Sclaudio /*
457987d16Sclaudio  * Copyright (c) 2021 Claudio Jeker <claudio@openbsd.org>
557987d16Sclaudio  *
657987d16Sclaudio  * Permission to use, copy, modify, and distribute this software for any
757987d16Sclaudio  * purpose with or without fee is hereby granted, provided that the above
857987d16Sclaudio  * copyright notice and this permission notice appear in all copies.
957987d16Sclaudio  *
1057987d16Sclaudio  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1157987d16Sclaudio  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1257987d16Sclaudio  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1357987d16Sclaudio  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1457987d16Sclaudio  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1557987d16Sclaudio  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1657987d16Sclaudio  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1757987d16Sclaudio  */
1857987d16Sclaudio 
1957987d16Sclaudio /*
2057987d16Sclaudio  * Copyright (c) 1989, 1993, 1994
2157987d16Sclaudio  *	The Regents of the University of California.  All rights reserved.
2257987d16Sclaudio  *
2357987d16Sclaudio  * This code is derived from software contributed to Berkeley by
2457987d16Sclaudio  * Guido van Rossum.
2557987d16Sclaudio  *
2657987d16Sclaudio  * Redistribution and use in source and binary forms, with or without
2757987d16Sclaudio  * modification, are permitted provided that the following conditions
2857987d16Sclaudio  * are met:
2957987d16Sclaudio  * 1. Redistributions of source code must retain the above copyright
3057987d16Sclaudio  *    notice, this list of conditions and the following disclaimer.
3157987d16Sclaudio  * 2. Redistributions in binary form must reproduce the above copyright
3257987d16Sclaudio  *    notice, this list of conditions and the following disclaimer in the
3357987d16Sclaudio  *    documentation and/or other materials provided with the distribution.
3457987d16Sclaudio  * 3. Neither the name of the University nor the names of its contributors
3557987d16Sclaudio  *    may be used to endorse or promote products derived from this software
3657987d16Sclaudio  *    without specific prior written permission.
3757987d16Sclaudio  *
3857987d16Sclaudio  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
3957987d16Sclaudio  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4057987d16Sclaudio  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4157987d16Sclaudio  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
4257987d16Sclaudio  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4357987d16Sclaudio  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
4457987d16Sclaudio  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
4557987d16Sclaudio  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
4657987d16Sclaudio  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
4757987d16Sclaudio  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
4857987d16Sclaudio  * SUCH DAMAGE.
4957987d16Sclaudio  */
5057987d16Sclaudio 
5157987d16Sclaudio #include <ctype.h>
5257987d16Sclaudio #include <stdio.h>
5357987d16Sclaudio #include <string.h>
5457987d16Sclaudio #include <limits.h>
5557987d16Sclaudio 
5657987d16Sclaudio #include "charclass.h"
57722539fdSclaudio #include "extern.h"
5857987d16Sclaudio 
5957987d16Sclaudio #define	RANGE_MATCH	1
6057987d16Sclaudio #define	RANGE_NOMATCH	0
6157987d16Sclaudio #define	RANGE_ERROR	(-1)
6257987d16Sclaudio 
6357987d16Sclaudio static int
classmatch(const char * pattern,char test,const char ** ep)6457987d16Sclaudio classmatch(const char *pattern, char test, const char **ep)
6557987d16Sclaudio {
6657987d16Sclaudio 	const char *mismatch = pattern;
6757987d16Sclaudio 	const struct cclass *cc;
6857987d16Sclaudio 	const char *colon;
6957987d16Sclaudio 	size_t len;
7057987d16Sclaudio 	int rval = RANGE_NOMATCH;
7157987d16Sclaudio 
7257987d16Sclaudio 	if (*pattern++ != ':') {
7357987d16Sclaudio 		*ep = mismatch;
7457987d16Sclaudio 		return RANGE_ERROR;
7557987d16Sclaudio 	}
7657987d16Sclaudio 	if ((colon = strchr(pattern, ':')) == NULL || colon[1] != ']') {
7757987d16Sclaudio 		*ep = mismatch;
7857987d16Sclaudio 		return RANGE_ERROR;
7957987d16Sclaudio 	}
8057987d16Sclaudio 	*ep = colon + 2;
8157987d16Sclaudio 	len = (size_t)(colon - pattern);
8257987d16Sclaudio 
8357987d16Sclaudio 	for (cc = cclasses; cc->name != NULL; cc++) {
8457987d16Sclaudio 		if (!strncmp(pattern, cc->name, len) && cc->name[len] == '\0') {
8557987d16Sclaudio 			if (cc->isctype((unsigned char)test))
8657987d16Sclaudio 				rval = RANGE_MATCH;
8757987d16Sclaudio 			return rval;
8857987d16Sclaudio 		}
8957987d16Sclaudio 	}
9057987d16Sclaudio 
9157987d16Sclaudio 	/* invalid character class, treat as normal text */
9257987d16Sclaudio 	*ep = mismatch;
9357987d16Sclaudio 	return RANGE_ERROR;
9457987d16Sclaudio }
9557987d16Sclaudio 
9657987d16Sclaudio static int
rangematch(const char ** pp,char test)9757987d16Sclaudio rangematch(const char **pp, char test)
9857987d16Sclaudio {
9957987d16Sclaudio 	const char *pattern = *pp;
10057987d16Sclaudio 	int negate, ok;
10157987d16Sclaudio 	char c, c2;
10257987d16Sclaudio 
10357987d16Sclaudio 	/*
10457987d16Sclaudio 	 * A bracket expression starting with an unquoted circumflex
10557987d16Sclaudio 	 * character produces unspecified results (IEEE 1003.2-1992,
10657987d16Sclaudio 	 * 3.13.2).  This implementation treats it like '!', for
10757987d16Sclaudio 	 * consistency with the regular expression syntax.
10857987d16Sclaudio 	 * J.T. Conklin (conklin@ngai.kaleida.com)
10957987d16Sclaudio 	 */
11057987d16Sclaudio 	if ((negate = (*pattern == '!' || *pattern == '^')))
11157987d16Sclaudio 		++pattern;
11257987d16Sclaudio 
11357987d16Sclaudio 	/*
11457987d16Sclaudio 	 * A right bracket shall lose its special meaning and represent
11557987d16Sclaudio 	 * itself in a bracket expression if it occurs first in the list.
11657987d16Sclaudio 	 * -- POSIX.2 2.8.3.2
11757987d16Sclaudio 	 */
11857987d16Sclaudio 	ok = 0;
11957987d16Sclaudio 	c = *pattern++;
12057987d16Sclaudio 	do {
12157987d16Sclaudio 		if (c == '[') {
12257987d16Sclaudio 			switch (classmatch(pattern, test, &pattern)) {
12357987d16Sclaudio 			case RANGE_MATCH:
12457987d16Sclaudio 				ok = 1;
12557987d16Sclaudio 				continue;
12657987d16Sclaudio 			case RANGE_NOMATCH:
12757987d16Sclaudio 				continue;
12857987d16Sclaudio 			default:
129d9a51c35Sjmc 				/* invalid character class, treat literally. */
13057987d16Sclaudio 				break;
13157987d16Sclaudio 			}
13257987d16Sclaudio 		}
13357987d16Sclaudio 		if (c == '\\')
13457987d16Sclaudio 			c = *pattern++;
13557987d16Sclaudio 		if (c == '\0')
13657987d16Sclaudio 			return RANGE_ERROR;
13757987d16Sclaudio 		/* patterns can not match on '/' */
13857987d16Sclaudio 		if (c == '/')
13957987d16Sclaudio 			return RANGE_NOMATCH;
14057987d16Sclaudio 		if (*pattern == '-'
14157987d16Sclaudio 		    && (c2 = *(pattern + 1)) != '\0' && c2 != ']') {
14257987d16Sclaudio 			pattern += 2;
14357987d16Sclaudio 			if (c2 == '\\')
14457987d16Sclaudio 				c2 = *pattern++;
14557987d16Sclaudio 			if (c2 == '\0')
14657987d16Sclaudio 				return RANGE_ERROR;
14757987d16Sclaudio 			if (c <= test && test <= c2)
14857987d16Sclaudio 				ok = 1;
14957987d16Sclaudio 		} else if (c == test)
15057987d16Sclaudio 			ok = 1;
15157987d16Sclaudio 	} while ((c = *pattern++) != ']');
15257987d16Sclaudio 
15357987d16Sclaudio 	*pp = pattern;
15457987d16Sclaudio 	return (ok == negate ? RANGE_NOMATCH : RANGE_MATCH);
15557987d16Sclaudio }
15657987d16Sclaudio 
15757987d16Sclaudio /*
15857987d16Sclaudio  * Single character match, advances pattern as much as needed.
159d9a51c35Sjmc  * Return 0 on match and !0 (aka 1) on mismatch.
16057987d16Sclaudio  * When matched pp is advanced to the end of the pattern matched.
16157987d16Sclaudio  */
16257987d16Sclaudio static int
matchchar(const char ** pp,const char in)16357987d16Sclaudio matchchar(const char **pp, const char in)
16457987d16Sclaudio {
16557987d16Sclaudio 	const char *pattern = *pp;
16657987d16Sclaudio 	char c;
16757987d16Sclaudio 	int rv = 0;
16857987d16Sclaudio 
16957987d16Sclaudio 	switch (c = *pattern++) {
17057987d16Sclaudio 	case '?':
17157987d16Sclaudio 		if (in == '\0')
17257987d16Sclaudio 			rv = 1;
17357987d16Sclaudio 		if (in == '/')
17457987d16Sclaudio 			rv = 1;
17557987d16Sclaudio 		break;
17657987d16Sclaudio 	case '[':
17757987d16Sclaudio 		if (in == '\0')
17857987d16Sclaudio 			rv = 1;
17957987d16Sclaudio 		if (in == '/')
18057987d16Sclaudio 			rv = 1;
18157987d16Sclaudio 		if (rv == 1)
18257987d16Sclaudio 			break;
18357987d16Sclaudio 
18457987d16Sclaudio 		switch (rangematch(&pattern, in)) {
18557987d16Sclaudio 		case RANGE_ERROR:
18657987d16Sclaudio 			/* not a good range, treat as normal text */
18757987d16Sclaudio 			goto normal;
18857987d16Sclaudio 		case RANGE_MATCH:
18957987d16Sclaudio 			break;
19057987d16Sclaudio 		case RANGE_NOMATCH:
19157987d16Sclaudio 			rv = 1;
19257987d16Sclaudio 		}
19357987d16Sclaudio 		break;
19457987d16Sclaudio 	case '\\':
19557987d16Sclaudio 		if ((c = *pattern++) == '\0') {
19657987d16Sclaudio 			c = '\\';
19757987d16Sclaudio 			--pattern;
19857987d16Sclaudio 		}
19957987d16Sclaudio 		/* FALLTHROUGH */
20057987d16Sclaudio 	default:
20157987d16Sclaudio 	normal:
20257987d16Sclaudio 		if (c != in)
20357987d16Sclaudio 			rv = 1;
20457987d16Sclaudio 		break;
20557987d16Sclaudio 	}
20657987d16Sclaudio 
20757987d16Sclaudio 	*pp = pattern;
20857987d16Sclaudio 	return rv;
20957987d16Sclaudio }
21057987d16Sclaudio 
21157987d16Sclaudio /*
21257987d16Sclaudio  * Do a substring match. If wild is set then the pattern started with a '*'.
21357987d16Sclaudio  * The match will go until '*', '/' or '\0' is encountered in pattern or
21457987d16Sclaudio  * the input string is consumed up to end.
21557987d16Sclaudio  * The pattern and string handles pp and ss are updated only on success.
21657987d16Sclaudio  */
21757987d16Sclaudio static int
matchsub(const char ** pp,const char ** ss,const char * end,int wild)21857987d16Sclaudio matchsub(const char **pp, const char **ss, const char *end, int wild)
21957987d16Sclaudio {
22057987d16Sclaudio 	const char *pattern = *pp;
22157987d16Sclaudio 	const char *p = pattern;
22257987d16Sclaudio 	const char *string = *ss;
22357987d16Sclaudio 	size_t matchlen;
22457987d16Sclaudio 
22557987d16Sclaudio 	/* first calculate how many characters the submatch will consume */
22657987d16Sclaudio 	for (matchlen = 0; *p != '\0'; matchlen++) {
22757987d16Sclaudio 		if (p[0] == '*')
22857987d16Sclaudio 			break;
22957987d16Sclaudio 		/* '/' acts as barrier */
23057987d16Sclaudio 		if (p[0] == '/' || (p[0] == '\\' && p[1] == '/')) {
23157987d16Sclaudio 			if (wild) {
23257987d16Sclaudio 				/* match needs to match up to end of segment */
23357987d16Sclaudio 				if (string > end - matchlen)
23457987d16Sclaudio 					return 1;
23557987d16Sclaudio 				string = end - matchlen;
23657987d16Sclaudio 				wild = 0;
23757987d16Sclaudio 			}
23857987d16Sclaudio 			break;
23957987d16Sclaudio 		}
24057987d16Sclaudio 		/*
24157987d16Sclaudio 		 * skip forward one character in pattern by doing a
24257987d16Sclaudio 		 * dummy lookup.
24357987d16Sclaudio 		 */
24457987d16Sclaudio 		matchchar(&p, ' ');
24557987d16Sclaudio 	}
24657987d16Sclaudio 
24757987d16Sclaudio 	/* not enough char to match */
24857987d16Sclaudio 	if (string > end - matchlen)
24957987d16Sclaudio 		return 1;
25057987d16Sclaudio 
25157987d16Sclaudio 	if (*p == '\0') {
25257987d16Sclaudio 		if (wild) {
25357987d16Sclaudio 			/* match needs to match up to end of segment */
25457987d16Sclaudio 			string = end - matchlen;
25557987d16Sclaudio 			wild = 0;
25657987d16Sclaudio 		}
25757987d16Sclaudio 	}
25857987d16Sclaudio 
25957987d16Sclaudio 	while (*pattern != '\0' && *pattern != '*') {
26057987d16Sclaudio 		/* eat possible escape char before '/' */
26157987d16Sclaudio 		if (pattern[0] == '\\' && pattern[1] == '/')
26257987d16Sclaudio 			pattern++;
263*936f2fafSclaudio 		if (pattern[0] == '/') {
264*936f2fafSclaudio 			/* hit the barrier but we still have characters left */
265*936f2fafSclaudio 			if (string < end)
266*936f2fafSclaudio 				return 1;
26757987d16Sclaudio 			break;
268*936f2fafSclaudio 		}
26957987d16Sclaudio 
27057987d16Sclaudio 		/* check if there are still characters available to compare */
27157987d16Sclaudio 		if (string >= end)
27257987d16Sclaudio 			return 1;
27357987d16Sclaudio 		/* Compare one char at a time. */
27457987d16Sclaudio 		if (!matchchar(&pattern, *string++))
27557987d16Sclaudio 			continue;
27657987d16Sclaudio 		if (wild) {
27757987d16Sclaudio 			/* skip forward one char and restart match */
27857987d16Sclaudio 			string = ++*ss;
27957987d16Sclaudio 			pattern = *pp;
28057987d16Sclaudio 			/* can it still match? */
28157987d16Sclaudio 			if (string > end - matchlen)
28257987d16Sclaudio 				return 1;
28357987d16Sclaudio 		} else {
28457987d16Sclaudio 			/* failed match */
28557987d16Sclaudio 			return 1;
28657987d16Sclaudio 		}
28757987d16Sclaudio 	}
28857987d16Sclaudio 
28957987d16Sclaudio 	*pp = pattern;
29057987d16Sclaudio 	*ss = string;
29157987d16Sclaudio 	return 0;
29257987d16Sclaudio }
29357987d16Sclaudio 
29457987d16Sclaudio /*
29557987d16Sclaudio  * File matching with the addition of the special '**'.
29657987d16Sclaudio  * Returns 0 on match and !0 for strings that do not match pattern.
29757987d16Sclaudio  */
29857987d16Sclaudio int
rmatch(const char * pattern,const char * string,int leading_dir)29957987d16Sclaudio rmatch(const char *pattern, const char *string, int leading_dir)
30057987d16Sclaudio {
30157987d16Sclaudio 	const char *segend, *segnext, *mismatch = NULL;
30257987d16Sclaudio 	int wild, starstar;
30357987d16Sclaudio 
30457987d16Sclaudio 	while (*pattern && *string) {
30557987d16Sclaudio 
30657987d16Sclaudio 		/* handle leading '/' first */
30757987d16Sclaudio 		if (pattern[0] == '\\' && pattern[1] == '/')
30857987d16Sclaudio 			pattern++;
30957987d16Sclaudio 		if (*string == '/' && *pattern == '/') {
31057987d16Sclaudio 			string++;
31157987d16Sclaudio 			pattern++;
31257987d16Sclaudio 		}
31357987d16Sclaudio 
31457987d16Sclaudio 		/* match to the next '/' in string */
31557987d16Sclaudio 		segend = strchr(string, '/');
31657987d16Sclaudio 		if (segend == NULL)
31757987d16Sclaudio 			segend = strchr(string, '\0');
31857987d16Sclaudio 
31957987d16Sclaudio 		while (*pattern) {
32057987d16Sclaudio 			/*
32157987d16Sclaudio 			 * Check for '*' and '**'. For '*' reduce '*' and '?'
32257987d16Sclaudio 			 * sequences into n-'?' and trailing '*'.
32357987d16Sclaudio 			 * For '**' this optimisation can not be done
32457987d16Sclaudio 			 * since '**???/' will match 'a/aa/aaa/' but not
32557987d16Sclaudio 			 * 'a/aa/aa/' still additional '*' will be reduced.
32657987d16Sclaudio 			 */
32757987d16Sclaudio 			wild = 0;
32857987d16Sclaudio 			starstar = 0;
32957987d16Sclaudio 			for ( ; *pattern == '*' || *pattern == '?'; pattern++) {
33057987d16Sclaudio 				if (pattern[0] == '*') {
33157987d16Sclaudio 					if (pattern[1] == '*') {
33257987d16Sclaudio 						starstar = 1;
33357987d16Sclaudio 						pattern++;
33457987d16Sclaudio 					}
33557987d16Sclaudio 					wild = 1;
33657987d16Sclaudio 				} else if (!starstar) {	/* pattern[0] == '?' */
33757987d16Sclaudio 					if (string < segend && *string != '/')
33857987d16Sclaudio 						string++;
33957987d16Sclaudio 					else
34057987d16Sclaudio 						/* no match possible */
34157987d16Sclaudio 						return 1;
34257987d16Sclaudio 				} else
34357987d16Sclaudio 					break;
34457987d16Sclaudio 			}
34557987d16Sclaudio 
34657987d16Sclaudio 			/* pattern ends in '**' so it is a match */
34757987d16Sclaudio 			if (starstar && *pattern == '\0')
34857987d16Sclaudio 				return 0;
34957987d16Sclaudio 
35057987d16Sclaudio 			if (starstar) {
35157987d16Sclaudio 				segnext = segend;
35257987d16Sclaudio 				mismatch = pattern;
35357987d16Sclaudio 			}
35457987d16Sclaudio 
35557987d16Sclaudio 			while (string < segend) {
35657987d16Sclaudio 				if (matchsub(&pattern, &string, segend, wild)) {
35757987d16Sclaudio failed_match:
35857987d16Sclaudio 					/*
35957987d16Sclaudio 					 * failed to match, if starstar retry
36057987d16Sclaudio 					 * with the next segment.
36157987d16Sclaudio 					 */
36257987d16Sclaudio 					if (mismatch) {
36357987d16Sclaudio 						pattern = mismatch;
36457987d16Sclaudio 						wild = 1;
36557987d16Sclaudio 						string = segnext;
36657987d16Sclaudio 						if (*string == '/')
36757987d16Sclaudio 							string++;
36857987d16Sclaudio 						segend = strchr(string, '/');
36957987d16Sclaudio 						if (!segend)
37057987d16Sclaudio 							segend = strchr(string,
37157987d16Sclaudio 							    '\0');
37257987d16Sclaudio 						segnext = segend;
37357987d16Sclaudio 						if (string < segend)
37457987d16Sclaudio 							continue;
37557987d16Sclaudio 					}
37657987d16Sclaudio 					/* no match possible */
37757987d16Sclaudio 					return 1;
37857987d16Sclaudio 				}
37957987d16Sclaudio 				break;
38057987d16Sclaudio 			}
38157987d16Sclaudio 
38257987d16Sclaudio 			/* at end of string segment, eat up any extra '*' */
38357987d16Sclaudio 			if (string >= segend && *pattern != '*')
38457987d16Sclaudio 				break;
38557987d16Sclaudio 		}
38657987d16Sclaudio 		if (*string != '\0' && *string != '/')
38757987d16Sclaudio 			goto failed_match;
38857987d16Sclaudio 		if (*pattern != '\0' && *pattern != '/')
38957987d16Sclaudio 			goto failed_match;
39057987d16Sclaudio 	}
39157987d16Sclaudio 
39257987d16Sclaudio 	/* if both pattern and string are consumed it was a match */
39357987d16Sclaudio 	if (*pattern == '\0' && *string == '\0')
39457987d16Sclaudio 		return 0;
39557987d16Sclaudio 	/* if leading_dir is set then string can also be '/' for success */
39657987d16Sclaudio 	if (leading_dir && *pattern == '\0' && *string == '/')
39757987d16Sclaudio 		return 0;
39857987d16Sclaudio 	/* else failure */
39957987d16Sclaudio 	return 1;
40057987d16Sclaudio }
401