xref: /netbsd-src/lib/libcompat/regexp/regexp.3 (revision 01869ca4d24a86379a68731bf9706a9f0820fe4e)
1dd267b8cScgd.\" Copyright (c) 1991, 1993
2dd267b8cScgd.\"	The Regents of the University of California.  All rights reserved.
361f28255Scgd.\"
461f28255Scgd.\" Redistribution and use in source and binary forms, with or without
561f28255Scgd.\" modification, are permitted provided that the following conditions
661f28255Scgd.\" are met:
761f28255Scgd.\" 1. Redistributions of source code must retain the above copyright
861f28255Scgd.\"    notice, this list of conditions and the following disclaimer.
961f28255Scgd.\" 2. Redistributions in binary form must reproduce the above copyright
1061f28255Scgd.\"    notice, this list of conditions and the following disclaimer in the
1161f28255Scgd.\"    documentation and/or other materials provided with the distribution.
12eb7c1594Sagc.\" 3. Neither the name of the University nor the names of its contributors
1361f28255Scgd.\"    may be used to endorse or promote products derived from this software
1461f28255Scgd.\"    without specific prior written permission.
1561f28255Scgd.\"
1661f28255Scgd.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1761f28255Scgd.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1861f28255Scgd.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1961f28255Scgd.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2061f28255Scgd.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2161f28255Scgd.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2261f28255Scgd.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2361f28255Scgd.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2461f28255Scgd.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2561f28255Scgd.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2661f28255Scgd.\" SUCH DAMAGE.
2761f28255Scgd.\"
28e37aec67Slukem.\"     from: @(#)regexp.3	8.1 (Berkeley) 6/4/93
29*01869ca4Swiz.\"	$NetBSD: regexp.3,v 1.18 2017/07/03 21:32:50 wiz Exp $
3061f28255Scgd.\"
31dd267b8cScgd.Dd June 4, 1993
3261f28255Scgd.Dt REGEXP 3
3361f28255Scgd.Os
3461f28255Scgd.Sh NAME
3561f28255Scgd.Nm regcomp ,
3661f28255Scgd.Nm regexec ,
3761f28255Scgd.Nm regsub ,
3861f28255Scgd.Nm regerror
397318d748Sapb.Nd obsolete "'regexp'" regular expression handlers
40312aca53Sperry.Sh LIBRARY
41312aca53Sperry.Lb libcompat
4261f28255Scgd.Sh SYNOPSIS
43472351e1Swiz.In regexp.h
4461f28255Scgd.Ft regexp *
4561f28255Scgd.Fn regcomp "const char *exp"
4661f28255Scgd.Ft int
4761f28255Scgd.Fn regexec "const regexp *prog" "const char *string"
4861f28255Scgd.Ft void
4961f28255Scgd.Fn regsub "const regexp *prog" "const char *source" "char *dest"
505a0d65cbSkleink.Ft void
515a0d65cbSkleink.Fn regerror "const char *msg"
5261f28255Scgd.Sh DESCRIPTION
53edcbc4e6Sjtc.Bf -symbolic
54dd267b8cScgdThis interface is made obsolete by
55dd267b8cScgd.Xr regex 3 .
56edcbc4e6SjtcIt is available from the compatibility library, libcompat.
57edcbc4e6Sjtc.Ef
58dd267b8cScgd.Pp
5961f28255ScgdThe
6061f28255Scgd.Fn regcomp ,
6161f28255Scgd.Fn regexec ,
6261f28255Scgd.Fn regsub ,
6361f28255Scgdand
6461f28255Scgd.Fn regerror
656b57aa9fSfairfunctions implement
6661f28255Scgd.Xr egrep 1 Ns -style
6761f28255Scgdregular expressions and supporting facilities.
6861f28255Scgd.Pp
6961f28255ScgdThe
7061f28255Scgd.Fn regcomp
7161f28255Scgdfunction
7261f28255Scgdcompiles a regular expression into a structure of type
736b57aa9fSfair.Em regexp ,
7461f28255Scgdand returns a pointer to it.
7561f28255ScgdThe space has been allocated using
7661f28255Scgd.Xr malloc 3
7761f28255Scgdand may be released by
786b57aa9fSfair.Xr free 3 .
7961f28255Scgd.Pp
8061f28255ScgdThe
8161f28255Scgd.Fn regexec
8261f28255Scgdfunction
8361f28255Scgdmatches a
8461f28255Scgd.Dv NUL Ns -terminated
8561f28255Scgd.Fa string
8661f28255Scgdagainst the compiled regular expression
8761f28255Scgdin
8861f28255Scgd.Fa prog .
8961f28255ScgdIt returns 1 for success and 0 for failure, and adjusts the contents of
9061f28255Scgd.Fa prog Ns 's
9161f28255Scgd.Em startp
9261f28255Scgdand
9361f28255Scgd.Em endp
9461f28255Scgd(see below) accordingly.
9561f28255Scgd.Pp
9661f28255ScgdThe members of a
976b57aa9fSfair.Em regexp
9861f28255Scgdstructure include at least the following (not necessarily in order):
9961f28255Scgd.Bd -literal -offset indent
10061f28255Scgdchar *startp[NSUBEXP];
10161f28255Scgdchar *endp[NSUBEXP];
10261f28255Scgd.Ed
10361f28255Scgd.Pp
10461f28255Scgdwhere
10561f28255Scgd.Dv NSUBEXP
10661f28255Scgdis defined (as 10) in the header file.
10761f28255ScgdOnce a successful
10861f28255Scgd.Fn regexec
10961f28255Scgdhas been done using the
11061f28255Scgd.Em regexp ,
11161f28255Scgdeach
11261f28255Scgd.Em startp Ns - Em endp
11361f28255Scgdpair describes one substring
11461f28255Scgdwithin the
11561f28255Scgd.Fa string ,
11661f28255Scgdwith the
11761f28255Scgd.Em startp
11861f28255Scgdpointing to the first character of the substring and
11961f28255Scgdthe
12061f28255Scgd.Em endp
12161f28255Scgdpointing to the first character following the substring.
12261f28255ScgdThe 0th substring is the substring of
12361f28255Scgd.Fa string
12461f28255Scgdthat matched the whole
12561f28255Scgdregular expression.
12661f28255ScgdThe others are those substrings that matched parenthesized expressions
12761f28255Scgdwithin the regular expression, with parenthesized expressions numbered
12861f28255Scgdin left-to-right order of their opening parentheses.
12961f28255Scgd.Pp
13061f28255ScgdThe
13161f28255Scgd.Fn regsub
13261f28255Scgdfunction
13361f28255Scgdcopies
13461f28255Scgd.Fa source
13561f28255Scgdto
13661f28255Scgd.Fa dest ,
13761f28255Scgdmaking substitutions according to the
13861f28255Scgdmost recent
13961f28255Scgd.Fn regexec
14061f28255Scgdperformed using
14161f28255Scgd.Fa prog .
142*01869ca4SwizEach instance of `&' in
14361f28255Scgd.Fa source
14461f28255Scgdis replaced by the substring
14561f28255Scgdindicated by
14661f28255Scgd.Em startp Ns Bq 0
14761f28255Scgdand
14861f28255Scgd.Em endp Ns Bq 0 .
14961f28255ScgdEach instance of
15061f28255Scgd.Sq \e Ns Em n ,
15161f28255Scgdwhere
15261f28255Scgd.Em n
15361f28255Scgdis a digit, is replaced by
15461f28255Scgdthe substring indicated by
15561f28255Scgd.Em startp Ns Bq Em n
15661f28255Scgdand
15761f28255Scgd.Em endp Ns Bq Em n .
158*01869ca4SwizTo get a literal `&' or
15961f28255Scgd.Sq \e Ns Em n
16061f28255Scgdinto
16161f28255Scgd.Fa dest ,
16261f28255Scgdprefix it with `\e';
163*01869ca4Swizto get a literal `\e' preceding `&' or
16461f28255Scgd.Sq \e Ns Em n ,
16561f28255Scgdprefix it with
16661f28255Scgdanother `\e'.
16761f28255Scgd.Pp
16861f28255ScgdThe
16961f28255Scgd.Fn regerror
17061f28255Scgdfunction
17161f28255Scgdis called whenever an error is detected in
17261f28255Scgd.Fn regcomp ,
17361f28255Scgd.Fn regexec ,
17461f28255Scgdor
17561f28255Scgd.Fn regsub .
17661f28255ScgdThe default
17761f28255Scgd.Fn regerror
17861f28255Scgdwrites the string
17961f28255Scgd.Fa msg ,
18061f28255Scgdwith a suitable indicator of origin,
18161f28255Scgdon the standard
18261f28255Scgderror output
18361f28255Scgdand invokes
1843f0dbdf7Smikel.Xr exit 3 .
18561f28255ScgdThe
18661f28255Scgd.Fn regerror
18761f28255Scgdfunction
18861f28255Scgdcan be replaced by the user if other actions are desirable.
18961f28255Scgd.Sh REGULAR EXPRESSION SYNTAX
19061f28255ScgdA regular expression is zero or more
19161f28255Scgd.Em branches ,
19261f28255Scgdseparated by `|'.
19361f28255ScgdIt matches anything that matches one of the branches.
19461f28255Scgd.Pp
19561f28255ScgdA branch is zero or more
19661f28255Scgd.Em pieces ,
19761f28255Scgdconcatenated.
19861f28255ScgdIt matches a match for the first, followed by a match for the second, etc.
19961f28255Scgd.Pp
20061f28255ScgdA piece is an
20161f28255Scgd.Em atom
20261f28255Scgdpossibly followed by `*', `+', or `?'.
20361f28255ScgdAn atom followed by `*' matches a sequence of 0 or more matches of the atom.
20461f28255ScgdAn atom followed by `+' matches a sequence of 1 or more matches of the atom.
20561f28255ScgdAn atom followed by `?' matches a match of the atom, or the null string.
20661f28255Scgd.Pp
20761f28255ScgdAn atom is a regular expression in parentheses (matching a match for the
20861f28255Scgdregular expression), a
20961f28255Scgd.Em range
21061f28255Scgd(see below), `.'
21161f28255Scgd(matching any single character), `^' (matching the null string at the
21261f28255Scgdbeginning of the input string), `$' (matching the null string at the
21361f28255Scgdend of the input string), a `\e' followed by a single character (matching
21461f28255Scgdthat character), or a single character with no other significance
21561f28255Scgd(matching that character).
21661f28255Scgd.Pp
21761f28255ScgdA
21861f28255Scgd.Em range
21961f28255Scgdis a sequence of characters enclosed in `[]'.
22061f28255ScgdIt normally matches any single character from the sequence.
22161f28255ScgdIf the sequence begins with `^',
22261f28255Scgdit matches any single character
22361f28255Scgd.Em not
22461f28255Scgdfrom the rest of the sequence.
22561f28255ScgdIf two characters in the sequence are separated by `\-', this is shorthand
22661f28255Scgdfor the full list of
22761f28255Scgd.Tn ASCII
22861f28255Scgdcharacters between them
22961f28255Scgd(e.g. `[0-9]' matches any decimal digit).
23061f28255ScgdTo include a literal `]' in the sequence, make it the first character
23161f28255Scgd(following a possible `^').
23261f28255ScgdTo include a literal `\-', make it the first or last character.
23361f28255Scgd.Sh AMBIGUITY
23461f28255ScgdIf a regular expression could match two different parts of the input string,
23561f28255Scgdit will match the one which begins earliest.
23661f28255ScgdIf both begin in the same place but match different lengths, or match
23761f28255Scgdthe same length in different ways, life gets messier, as follows.
23861f28255Scgd.Pp
23961f28255ScgdIn general, the possibilities in a list of branches are considered in
24061f28255Scgdleft-to-right order, the possibilities for `*', `+', and `?' are
24161f28255Scgdconsidered longest-first, nested constructs are considered from the
24261f28255Scgdoutermost in, and concatenated constructs are considered leftmost-first.
24361f28255ScgdThe match that will be chosen is the one that uses the earliest
24461f28255Scgdpossibility in the first choice that has to be made.
24561f28255ScgdIf there is more than one choice, the next will be made in the same manner
24661f28255Scgd(earliest possibility) subject to the decision on the first choice.
24761f28255ScgdAnd so forth.
24861f28255Scgd.Pp
24961f28255ScgdFor example,
25061f28255Scgd.Sq Li (ab|a)b*c
25161f28255Scgdcould match
25261f28255Scgd`abc' in one of two ways.
25361f28255ScgdThe first choice is between `ab' and `a'; since `ab' is earlier, and does
25461f28255Scgdlead to a successful overall match, it is chosen.
25561f28255ScgdSince the `b' is already spoken for,
25661f28255Scgdthe `b*' must match its last possibility\(emthe empty string\(emsince
25761f28255Scgdit must respect the earlier choice.
25861f28255Scgd.Pp
25961f28255ScgdIn the particular case where no `|'s are present and there is only one
26061f28255Scgd`*', `+', or `?', the net effect is that the longest possible
26161f28255Scgdmatch will be chosen.
26261f28255ScgdSo
26361f28255Scgd.Sq Li ab* ,
26461f28255Scgdpresented with `xabbbby', will match `abbbb'.
26561f28255ScgdNote that if
26661f28255Scgd.Sq Li ab*
26761f28255Scgdis tried against `xabyabbbz', it
26861f28255Scgdwill match `ab' just after `x', due to the begins-earliest rule.
26961f28255Scgd(In effect, the decision on where to start the match is the first choice
27061f28255Scgdto be made, hence subsequent choices must respect it even if this leads them
27161f28255Scgdto less-preferred alternatives.)
27261f28255Scgd.Sh RETURN VALUES
27361f28255ScgdThe
27461f28255Scgd.Fn regcomp
27561f28255Scgdfunction
27661f28255Scgdreturns
27761f28255Scgd.Dv NULL
27861f28255Scgdfor a failure
27961f28255Scgd.Pf ( Fn regerror
28061f28255Scgdpermitting),
28161f28255Scgdwhere failures are syntax errors, exceeding implementation limits,
28261f28255Scgdor applying `+' or `*' to a possibly-null operand.
28361f28255Scgd.Sh SEE ALSO
28461f28255Scgd.Xr ed 1 ,
285967a5d86Swiz.Xr egrep 1 ,
28661f28255Scgd.Xr ex 1 ,
28761f28255Scgd.Xr expr 1 ,
28861f28255Scgd.Xr fgrep 1 ,
28961f28255Scgd.Xr grep 1 ,
290406388c7Sdholland.Xr regex 3 ,
291406388c7Sdholland.Xr re_format 7
29261f28255Scgd.Sh HISTORY
29361f28255ScgdBoth code and manual page for
29461f28255Scgd.Fn regcomp ,
29561f28255Scgd.Fn regexec ,
29661f28255Scgd.Fn regsub ,
29761f28255Scgdand
29861f28255Scgd.Fn regerror
29961f28255Scgdwere written at the University of Toronto
30061f28255Scgdand appeared in
30161f28255Scgd.Bx 4.3 tahoe .
30261f28255ScgdThey are intended to be compatible with the Bell V8
30361f28255Scgd.Xr regexp 3 ,
30461f28255Scgdbut are not derived from Bell code.
30561f28255Scgd.Sh BUGS
30661f28255ScgdEmpty branches and empty regular expressions are not portable to V8.
30761f28255Scgd.Pp
30861f28255ScgdThe restriction against
30961f28255Scgdapplying `*' or `+' to a possibly-null operand is an artifact of the
31061f28255Scgdsimplistic implementation.
31161f28255Scgd.Pp
31261f28255ScgdDoes not support
31342aff08cSfair.Xr egrep 1 Ns 's
31461f28255Scgdnewline-separated branches;
31561f28255Scgdneither does the V8
31661f28255Scgd.Xr regexp 3 ,
31761f28255Scgdthough.
31861f28255Scgd.Pp
31961f28255ScgdDue to emphasis on
32061f28255Scgdcompactness and simplicity,
32161f28255Scgdit's not strikingly fast.
32261f28255ScgdIt does give special attention to handling simple cases quickly.
323