Return to BSD News archive
Path: sserve!newshost.anu.edu.au!munnari.oz.au!news.Hawaii.Edu!ames!saimiri.primate.wisc.edu!zaphod.mps.ohio-state.edu!cs.utexas.edu!uunet!mcsun!dkuug!dde!kim
From: kim@dde.dk (Kim Andersen)
Newsgroups: comp.os.386bsd.bugs
Subject: re_exec, re_comp etc. code
Message-ID: <1993Mar18.174631.28789@dde.dk>
Date: 18 Mar 93 17:46:31 GMT
Organization: Dansk Data Elektronik A/S
Lines: 1481
Here are patches to add re_exec and companion functions.
The code is written by Ozan Yigit, and kindly donated to the public domain.
To apply:
patch -d/usr/src/lib/libc/gen -p0 -N < "what ever you saved this file as"
*** Makefile.inc~ Thu Mar 18 18:31:13 1993
--- Makefile.inc Thu Mar 18 18:31:13 1993
***************
*** 1,7 ****
# @(#)Makefile.inc 5.21 (Berkeley) 5/24/91
# gen sources
! .PATH: ${.CURDIR}/${MACHINE}/gen ${.CURDIR}/gen ${.CURDIR}/gen/regexp
SRCS+= alarm.c clock.c closedir.c crypt_dummy.c ctermid.c ctime.c ctype_.c \
difftime.c disklabel.c errlst.c exec.c fnmatch.c frexp.c fstab.c \
--- 1,7 ----
# @(#)Makefile.inc 5.21 (Berkeley) 5/24/91
# gen sources
! .PATH: ${.CURDIR}/${MACHINE}/gen ${.CURDIR}/gen ${.CURDIR}/gen/regexp ${.CURDIR}/gen/regex
SRCS+= alarm.c clock.c closedir.c crypt_dummy.c ctermid.c ctime.c ctype_.c \
difftime.c disklabel.c errlst.c exec.c fnmatch.c frexp.c fstab.c \
***************
*** 21,26 ****
--- 21,29 ----
# gen/regexp sources
SRCS+= regerror.c regexp.c regsub.c
+ # gen/regex sources
+ SRCS+= re_fail.c regex.c
+
.if (${MACHINE} == "hp300")
SRCS+= _setjmp.s alloca.s fabs.s ldexp.s modf.s setjmp.s
SRCS+= adddf3.s addsf3.s ashlsi3.s ashrsi3.s cmpdf2.s cmpsf2.s divdf3.s \
***************
*** 47,53 ****
isalpha.0 isascii.0 iscntrl.0 isdigit.0 isgraph.0 isinf.0 \
islower.0 isprint.0 ispunct.0 isspace.0 isupper.0 isxdigit.0 \
ldexp.0 modf.0 nice.0 nlist.0 pause.0 popen.0 psignal.0 \
! raise.0 regexp.0 scandir.0 setjmp.0 setmode.0 setuid.0 \
siginterrupt.0 signal.0 sigsetops.0 sleep.0 syslog.0 time.0 \
times.0 timezone.0 tolower.0 toupper.0 ttyname.0 tzset.0 \
ualarm.0 unvis.0 usleep.0 utime.0 valloc.0 vis.0
--- 50,56 ----
isalpha.0 isascii.0 iscntrl.0 isdigit.0 isgraph.0 isinf.0 \
islower.0 isprint.0 ispunct.0 isspace.0 isupper.0 isxdigit.0 \
ldexp.0 modf.0 nice.0 nlist.0 pause.0 popen.0 psignal.0 \
! raise.0 regex.0 regexp.0 scandir.0 setjmp.0 setmode.0 setuid.0 \
siginterrupt.0 signal.0 sigsetops.0 sleep.0 syslog.0 time.0 \
times.0 timezone.0 tolower.0 toupper.0 ttyname.0 tzset.0 \
ualarm.0 unvis.0 usleep.0 utime.0 valloc.0 vis.0
***************
*** 73,78 ****
--- 76,82 ----
MLINKS+=glob.3 globfree.3
MLINKS+=popen.3 pclose.3
MLINKS+=psignal.3 sys_siglist.3
+ MLINKS+=regex.3
MLINKS+=regexp.3 regcomp.3 regexp.3 regexec.3 regexp.3 regsub.3 \
regexp.3 regerror.3
MLINKS+=scandir.3 alphasort.3
*** /dev/null Thu Mar 18 15:24:08 1993
--- regex/grep.c Thu Mar 18 18:31:14 1993
***************
*** 0 ****
--- 1,67 ----
+ #ifdef vms
+ #include stdio
+ #else
+ #include <stdio.h>
+ #endif
+
+ /*
+ * Rudimentary grep to test regex routines.
+ *
+ * DEBUG is only applicable
+ * to oZ version of regex. Make sure to
+ * compile regex.c with -DDEBUG as well.
+ *
+ */
+ extern char *re_comp();
+ extern re_exec();
+
+ main(argc,argv)
+ char *argv[];
+ {
+ char str[512];
+ FILE *f;
+ register int n;
+ register char *p;
+
+ if (argc < 2)
+ error("usage: grep pat [file]");
+
+ if ((p = re_comp(argv[1])) != 0) {
+ printf("\t%s: %s\n", p, argv[1]);
+ exit(1);
+ }
+ #ifdef DEBUG
+ symbolic(argv[1]);
+ #endif
+ if (p = argv[2]) {
+ if ((f = fopen(p, "r")) == NULL) {
+ printf("cannot open %s\n", argv[2]);
+ exit(1);
+ }
+ while ((n = load(str, f)) != EOF)
+ if (re_exec(str))
+ printf("%s\n",str);
+
+ }
+ exit(0);
+ }
+ load (s, f)
+ char *s;
+ FILE *f;
+ {
+ register int c;
+ static int lineno = 0;
+
+ while ((c = getc(f)) != '\n' && c != EOF)
+ *s++ = c;
+ if (c == EOF)
+ return (EOF);
+ *s = (char) 0;
+ return (++lineno);
+ }
+
+ error(s) char *s ;
+ {
+ fprintf(stderr,"%s\n",s);
+ exit(1);
+ }
*** /dev/null Thu Mar 18 15:24:08 1993
--- regex/makefile Thu Mar 18 18:31:15 1993
***************
*** 0 ****
--- 1,94 ----
+ #
+ # regex routines (PUBLIC DOMAIN)
+ #
+ # by: Ozan S. Yigit (oz)
+ # Dept. of Computer Science
+ # York University
+ #
+ # Applicable to BSD:
+ #
+ # If you have the generic(?) regex routines
+ # than you can compare the timings of these
+ # routines to the generic ones by:
+ #
+ # make times
+ #
+ # which will create two rudimentary greps
+ # lgrep and ogrep. lgrep will use the generic
+ # regex routines, and ogrep will use oz version
+ # of regex. Several patterns will be searched
+ # in /usr/dict/words, and the output of the greps
+ # will be compared. [for reasons of sanity]
+ #
+ # Surely, you will note, the time tests are somewhat
+ # biased, since /usr/dict/words contain *short* lines,
+ # thereby the real-life case of searching a complex
+ # expression within a long line is not covered. You
+ # will find, however, that the PD regex routines will
+ # search *as fast* as the generic ones in most
+ # cases, and about 10% slower in some cases, when
+ # tested with files containing *long* lines.
+ #
+ CFLAGS = -O
+ #
+ # test patterns
+ #
+ PAT1 = '[d-f]zz*.*m'
+ PAT2 = 'fo[ornt]*.*b[a-d]*'
+ PAT3 = '.th.'
+ PAT4 = '\(ab\)[a-d]*\1'
+ PAT5 = 'burp'
+
+ FILE = /usr/dict/words
+ OUTD = /tmp/
+
+ RSRC = regex.o re_fail.o
+
+ regex: $(RSRC)
+ #
+ # insert code to put these into a library
+ #
+ rlint:
+ lint -phc regex.c
+ debug:
+ cc -O -DDEBUG -o ogrep grep.c regex.c re_fail.c
+
+ lgrep: grep.o
+ cc -o lgrep grep.o
+
+ ogrep: grep.o $(RSRC)
+ cc -o ogrep grep.o $(RSRC)
+
+ times: lgrep ogrep
+ @echo generic regex vs oz regex
+ # @echo pattern: $(PAT1)
+ time ogrep $(PAT1) $(FILE) >$(OUTD)ogrep.out
+ time lgrep $(PAT1) $(FILE) >$(OUTD)lgrep.out
+ @echo output differences:
+ -diff $(OUTD)ogrep.out $(OUTD)lgrep.out
+ @echo "---"
+ # @echo pattern: $(PAT2)
+ time ogrep $(PAT2) $(FILE) >$(OUTD)ogrep.out
+ time lgrep $(PAT2) $(FILE) >$(OUTD)lgrep.out
+ @echo output differences:
+ -diff $(OUTD)ogrep.out $(OUTD)lgrep.out
+ @echo "---"
+ # echo pattern: $(PAT3)
+ time ogrep $(PAT3) $(FILE) >$(OUTD)ogrep.out
+ time lgrep $(PAT3) $(FILE) >$(OUTD)lgrep.out
+ @echo output differences:
+ -diff $(OUTD)ogrep.out $(OUTD)lgrep.out
+ @echo "---"
+ # echo pattern: $(PAT4)
+ time ogrep $(PAT4) $(FILE) >$(OUTD)ogrep.out
+ time lgrep $(PAT4) $(FILE) >$(OUTD)lgrep.out
+ @echo output differences:
+ -diff $(OUTD)ogrep.out $(OUTD)lgrep.out
+ @echo "---"
+ # echo pattern: $(PAT5)
+ time ogrep $(PAT5) $(FILE) >$(OUTD)ogrep.out
+ time lgrep $(PAT5) $(FILE) >$(OUTD)lgrep.out
+ @echo output differences:
+ -diff $(OUTD)ogrep.out $(OUTD)lgrep.out
+ @echo "---"
+ @rm $(OUTD)ogrep.out $(OUTD)lgrep.out
*** /dev/null Thu Mar 18 15:24:08 1993
--- regex/re_fail.c Thu Mar 18 18:31:16 1993
***************
*** 0 ****
--- 1,22 ----
+
+ #ifdef vms
+ #include stdio
+ #else
+ #include <stdio.h>
+ #endif
+
+ /*
+ * re_fail:
+ * internal error handler for re_exec.
+ *
+ * should probably do something like a
+ * longjump to recover gracefully.
+ */
+ void
+ re_fail(s, c)
+ char *s;
+ char c;
+ {
+ fprintf(stderr, "%s [opcode %o]\n", s, c);
+ exit(1);
+ }
*** /dev/null Thu Mar 18 15:24:08 1993
--- regex/regex.3 Thu Mar 18 18:31:16 1993
***************
*** 0 ****
--- 1,326 ----
+ .TH regex 3 local
+ .DA Jun 19 1986
+ .SH NAME
+ re_comp, re_exec, re_subs, re_modw, re_fail \- regular expression handling
+ .SH ORIGIN
+ Dept. of Computer Science
+ .br
+ York University
+ .SH SYNOPSIS
+ .B char *re_comp(pat)
+ .br
+ .B char *pat;
+ .PP
+ .B re_exec(str)
+ .br
+ .B char *str;
+ .PP
+ .B re_subs(src, dst)
+ .br
+ .B char *src;
+ .br
+ .B char *dst;
+ .PP
+ .B void re_fail(msg, op)
+ .br
+ .B char *msg;
+ .br
+ .B char op;
+ .PP
+ .B void re_modw(str)
+ .br
+ .B char *str;
+
+ .SH DESCRIPTION
+ .PP
+ These functions implement
+ .IR ed (1)-style
+ partial regular expressions and supporting facilities.
+ .PP
+ .I Re_comp
+ compiles a pattern string into an internal form (a deterministic finite-state
+ automaton) to be executed by
+ .I re_exec
+ for pattern matching.
+ .I Re_comp
+ returns 0 if the pattern is compiled successfully, otherwise it returns an
+ error message string. If
+ .I re_comp
+ is called with a 0 or a \fInull\fR string, it returns without changing the
+ currently compiled regular expression.
+ .sp
+ .I Re_comp
+ supports the same limited set of
+ .I regular expressions
+ found in
+ .I ed
+ and Berkeley
+ .IR regex (3)
+ routines:
+ .sp
+ .if n .in +1.6i
+ .if t .in +1i
+ .de Ti
+ .if n .ti -1.6i
+ .if t .ti -1i
+ ..
+ .if n .ta 0.8i +0.8i +0.8i
+ .if t .ta 0.5i +0.5i +0.5i
+ .Ti
+ [1] \fIchar\fR Matches itself, unless it is a special
+ character (meta-character): \fB. \\ [ ] * + ^ $\fR
+
+ .Ti
+ [2] \fB.\fR Matches \fIany\fR character.
+
+ .Ti
+ [3] \fB\\\fR Matches the character following it, except
+ when followed by a digit 1 to 9, \fB(\fR, fB)\fR, \fB<\fR or \fB>\fR.
+ (see [7], [8] and [9]) It is used as an escape character for all
+ other meta-characters, and itself. When used
+ in a set ([4]), it is treated as an ordinary
+ character.
+
+ .Ti
+ [4] \fB[\fIset\fB]\fR Matches one of the characters in the set.
+ If the first character in the set is \fB^\fR,
+ it matches a character NOT in the set. A
+ shorthand
+ .IR S - E
+ is used to specify a set of
+ characters
+ .I S
+ up to
+ .IR E ,
+ inclusive. The special
+ characters \fB]\fR and \fB-\fR have no special
+ meaning if they appear as the first chars
+ in the set.
+ .nf
+ examples: match:
+ [a-z] any lowercase alpha
+ [^]-] any char except ] and -
+ [^A-Z] any char except
+ uppercase alpha
+ [a-zA-Z0-9] any alphanumeric
+ .fi
+
+ .Ti
+ [5] \fB*\fR Any regular expression form [1] to [4], followed by
+ closure char (*) matches zero or more matches of
+ that form.
+
+ .Ti
+ [6] \fB+\fR Same as [5], except it matches one or more.
+
+ .Ti
+ [7] A regular expression in the form [1] to [10], enclosed
+ as \\(\fIform\fR\\) matches what form matches. The enclosure
+ creates a set of tags, used for [8] and for
+ pattern substitution in
+ .I re_subs.
+ The tagged forms are numbered
+ starting from 1.
+
+ .Ti
+ [8] A \\ followed by a digit 1 to 9 matches whatever a
+ previously tagged regular expression ([7]) matched.
+
+ .Ti
+ [9] \fB\\<\fR Matches the beginning of a \fIword\fR,
+ that is, an empty string followed by a
+ letter, digit, or _ and not preceded by
+ a letter, digit, or _ .
+ .Ti
+ \fB\\>\fR Matches the end of a \fIword\fR,
+ that is, an empty string preceded
+ by a letter, digit, or _ , and not
+ followed by a letter, digit, or _ .
+
+ .Ti
+ [10] A composite regular expression
+ \fIxy\fR where \fIx\fR and \fIy\fR
+ are in the form of [1] to [10] matches the longest
+ match of \fIx\fR followed by a match for \fIy\fR.
+
+ .Ti
+ [11] \fB^ $\fR a regular expression starting with a \fB^\fR character
+ and/or ending with a \fB$\fR character, restricts the
+ pattern matching to the beginning of the line,
+ and/or the end of line [anchors]. Elsewhere in the
+ pattern, \fB^\fR and \fB$\fR are treated as ordinary characters.
+ .if n .in -1.6i
+ .if t .in -1i
+
+ .PP
+ .I Re_exec
+ executes the internal form produced by
+ .I re_comp
+ and searches the argument string for the regular expression described
+ by the internal
+ form.
+ .I Re_exec
+ returns 1 if the last regular expression pattern is matched within the string,
+ 0 if no match is found. In case of an internal error (corrupted internal
+ form),
+ .I re_exec
+ calls the user-supplied
+ .I re_fail
+ and returns 0.
+ .PP
+ The strings passed to both
+ .I re_comp
+ and
+ .I re_exec
+ may have trailing or embedded newline characters. The strings
+ must be terminated by nulls.
+ .PP
+ .I Re_subs
+ does
+ .IR ed -style
+ pattern substitution, after a successful match is found by
+ .I re_exec.
+ The source string parameter to
+ .I re_subs
+ is copied to the destination string with the following interpretation;
+ .sp
+ .if n .in +1.6i
+ .if t .in +1i
+ .Ti
+ [1] & Substitute the entire matched string in the destination.
+
+ .Ti
+ [2] \\\fIn\fR Substitute the substring matched by a tagged subpattern
+ numbered \fIn\fR, where \fIn\fR is between 1 to 9, inclusive.
+
+ .Ti
+ [3] \\\fIchar\fR Treat the next character literally,
+ unless the character is a digit ([2]).
+ .if n .in -1.6i
+ .if t .in -1i
+
+ .PP
+ If the copy operation with the substitutions is successful,
+ .I re_subs
+ returns 1.
+ If the source string is corrupted, or the last call to
+ .I re_exec
+ fails, it returns 0.
+
+ .I Re_modw
+ is used to
+ add new characters into an internal table to
+ change the re_exec's understanding of what
+ a \fIword\fR should look like, when matching with \fB\\<\fR and \fB\\>\fR
+ constructs. If the string parameter is 0 or null string,
+ the table is reset back to the default, which contains \fBA-Z a-z 0-9 _\fR .
+
+ .I Re_fail
+ is a user-supplied routine to handle internal errors.
+ .I re_exec
+ calls
+ .I re_fail
+ with an error message string, and the opcode character that caused the error.
+ The default
+ .I re_fail
+ routine simply prints the message and the opcode character to
+ .I stderr
+ and invokes
+ .IR exit (2).
+ .SH EXAMPLES
+ In the examples below, the
+ .I nfaform
+ describes the internal form after the pattern is compiled. For additional
+ details, refer to the sources.
+ .PP
+ .ta 0.5i +0.5i +0.5i
+ .nf
+ foo*.*
+ nfaform: CHR f CHR o CLO CHR o END CLO ANY END END
+ matches: \fIfo foo fooo foobar fobar foxx ...\fR
+
+ fo[ob]a[rz]
+ nfaform: CHR f CHR o CCL 2 o b CHR a CCL 2 r z END
+ matches: \fIfobar fooar fobaz fooaz\fR
+
+ foo\\\\+
+ nfaform: CHR f CHR o CHR o CHR \\ CLO CHR \\ END END
+ matches: \fIfoo\\ foo\\\\ foo\\\\\\ ...\fR
+
+ \\(foo\\)[1-3]\\1 (same as foo[1-3]foo, but takes less internal space)
+ nfaform: BOT 1 CHR f CHR o CHR o EOT 1 CCL 3 1 2 3 REF 1 END
+ matches: \fIfoo1foo foo2foo foo3foo\fR
+
+ \\(fo.*\\)-\\1
+ nfaform: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
+ matches: \fIfoo-foo fo-fo fob-fob foobar-foobar ...\fR
+ .SH DIAGNOSTICS
+ .I Re_comp
+ returns one of the following strings if an error occurs:
+ .PP
+ .nf
+ .in +0.5i
+ \fINo previous regular expression,
+ Empty closure,
+ Illegal closure,
+ Cyclical reference,
+ Undetermined reference,
+ Unmatched \e(,
+ Missing ],
+ Null pattern inside \e(\e),
+ Null pattern inside \e<\e>,
+ Too many \e(\e) pairs,
+ Unmatched \e)\fP.
+ .in -0.5i
+ .fi
+ .SH REFERENCES
+ .if n .ta 3i
+ .if t .ta 1.8i
+ .nf
+ \fISoftware tools\fR Kernighan & Plauger
+ \fISoftware tools in Pascal\fR Kernighan & Plauger
+ \fIGrep sources\fR [rsx-11 C dist] David Conroy
+ \fIEd - text editor\fR Unix Programmer's Manual
+ \fIAdvanced editing on Unix\fR B. W. Kernighan
+ \fIRegExp sources\fR Henry Spencer
+ .fi
+ .SH "HISTORY AND NOTES"
+ These routines are derived from various implementations
+ found in
+ .I "Software Tools"
+ books, and David Conroy's
+ .I grep.
+ They are NOT derived from licensed/restricted software.
+ For more interesting/academic/complicated implementations,
+ see Henry Spencer's
+ .I regexp
+ routines (V8), or
+ .I "GNU Emacs"
+ pattern
+ matching module.
+ .PP
+ The
+ .I re_comp
+ and
+ .I re_exec
+ routines perform
+ .I almost
+ as well as their licensed counterparts, sometimes better.
+ In very few instances, they
+ are about 10% to 15% slower.
+ .SH AUTHOR
+ Ozan S. Yigit (oz)
+ .br
+ usenet: utzoo!yetti!oz
+ .br
+ bitnet: oz@yusol || oz@yuyetti
+ .SH "SEE ALSO"
+ ed(1), ex(1), egrep(1), fgrep(1), grep(1), regex(3)
+ .SH BUGS
+ These routines are \fIPublic Domain\fR. You can get them
+ in source.
+ .br
+ The internal storage for the \fInfa form\fR is not checked for
+ overflows. Currently, it is 1024 bytes.
+ .br
+ Others, no doubt.
*** /dev/null Thu Mar 18 15:24:08 1993
--- regex/regex.c Thu Mar 18 18:31:17 1993
***************
*** 0 ****
--- 1,877 ----
+ /*
+ * regex - Regular expression pattern matching
+ * and replacement
+ *
+ *
+ * By: Ozan S. Yigit (oz)
+ * Dept. of Computer Science
+ * York University
+ *
+ *
+ * These routines are the PUBLIC DOMAIN equivalents
+ * of regex routines as found in 4.nBSD UN*X, with minor
+ * extensions.
+ *
+ * These routines are derived from various implementations
+ * found in software tools books, and Conroy's grep. They
+ * are NOT derived from licensed/restricted software.
+ * For more interesting/academic/complicated implementations,
+ * see Henry Spencer's regexp routines, or GNU Emacs pattern
+ * matching module.
+ *
+ * Modification history:
+ *
+ * $Log: regex.c,v $
+ * Revision 1.3 89/04/01 14:18:09 oz
+ * Change all references to a dfa: this is actually an nfa.
+ *
+ * Revision 1.2 88/08/28 15:36:04 oz
+ * Use a complement bitmap to represent NCL.
+ * This removes the need to have seperate
+ * code in the pmatch case block - it is
+ * just CCL code now.
+ *
+ * Use the actual CCL code in the CLO
+ * section of pmatch. No need for a recursive
+ * pmatch call.
+ *
+ * Use a bitmap table to set char bits in an
+ * 8-bit chunk.
+ *
+ *
+ * Routines:
+ * re_comp: compile a regular expression into
+ * a NFA.
+ *
+ * char *re_comp(s)
+ * char *s;
+ *
+ * re_exec: execute the NFA to match a pattern.
+ *
+ * int re_exec(s)
+ * char *s;
+ *
+ * re_modw change re_exec's understanding of what
+ * a "word" looks like (for \< and \>)
+ * by adding into the hidden word-syntax
+ * table.
+ *
+ * void re_modw(s)
+ * char *s;
+ *
+ * re_subs: substitute the matched portions in
+ * a new string.
+ *
+ * int re_subs(src, dst)
+ * char *src;
+ * char *dst;
+ *
+ * re_fail: failure routine for re_exec.
+ *
+ * void re_fail(msg, op)
+ * char *msg;
+ * char op;
+ *
+ * Regular Expressions:
+ *
+ * [1] char matches itself, unless it is a special
+ * character (metachar): . \ [ ] * + ^ $
+ *
+ * [2] . matches any character.
+ *
+ * [3] \ matches the character following it, except
+ * when followed by a left or right round bracket,
+ * a digit 1 to 9 or a left or right angle bracket.
+ * (see [7], [8] and [9])
+ * It is used as an escape character for all
+ * other meta-characters, and itself. When used
+ * in a set ([4]), it is treated as an ordinary
+ * character.
+ *
+ * [4] [set] matches one of the characters in the set.
+ * If the first character in the set is "^",
+ * it matches a character NOT in the set, i.e.
+ * complements the set. A shorthand S-E is
+ * used to specify a set of characters S upto
+ * E, inclusive. The special characters "]" and
+ * "-" have no special meaning if they appear
+ * as the first chars in the set.
+ * examples: match:
+ *
+ * [a-z] any lowercase alpha
+ *
+ * [^]-] any char except ] and -
+ *
+ * [^A-Z] any char except uppercase
+ * alpha
+ *
+ * [a-zA-Z] any alpha
+ *
+ * [5] * any regular expression form [1] to [4], followed by
+ * closure char (*) matches zero or more matches of
+ * that form.
+ *
+ * [6] + same as [5], except it matches one or more.
+ *
+ * [7] a regular expression in the form [1] to [10], enclosed
+ * as \(form\) matches what form matches. The enclosure
+ * creates a set of tags, used for [8] and for
+ * pattern substution. The tagged forms are numbered
+ * starting from 1.
+ *
+ * [8] a \ followed by a digit 1 to 9 matches whatever a
+ * previously tagged regular expression ([7]) matched.
+ *
+ * [9] \< a regular expression starting with a \< construct
+ * \> and/or ending with a \> construct, restricts the
+ * pattern matching to the beginning of a word, and/or
+ * the end of a word. A word is defined to be a character
+ * string beginning and/or ending with the characters
+ * A-Z a-z 0-9 and _. It must also be preceded and/or
+ * followed by any character outside those mentioned.
+ *
+ * [10] a composite regular expression xy where x and y
+ * are in the form [1] to [10] matches the longest
+ * match of x followed by a match for y.
+ *
+ * [11] ^ a regular expression starting with a ^ character
+ * $ and/or ending with a $ character, restricts the
+ * pattern matching to the beginning of the line,
+ * or the end of line. [anchors] Elsewhere in the
+ * pattern, ^ and $ are treated as ordinary characters.
+ *
+ *
+ * Acknowledgements:
+ *
+ * HCR's Hugh Redelmeier has been most helpful in various
+ * stages of development. He convinced me to include BOW
+ * and EOW constructs, originally invented by Rob Pike at
+ * the University of Toronto.
+ *
+ * References:
+ * Software tools Kernighan & Plauger
+ * Software tools in Pascal Kernighan & Plauger
+ * Grep [rsx-11 C dist] David Conroy
+ * ed - text editor Un*x Programmer's Manual
+ * Advanced editing on Un*x B. W. Kernighan
+ * RegExp routines Henry Spencer
+ *
+ * Notes:
+ *
+ * This implementation uses a bit-set representation for character
+ * classes for speed and compactness. Each character is represented
+ * by one bit in a 128-bit block. Thus, CCL always takes a
+ * constant 16 bytes in the internal nfa, and re_exec does a single
+ * bit comparison to locate the character in the set.
+ *
+ * Examples:
+ *
+ * pattern: foo*.*
+ * compile: CHR f CHR o CLO CHR o END CLO ANY END END
+ * matches: fo foo fooo foobar fobar foxx ...
+ *
+ * pattern: fo[ob]a[rz]
+ * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
+ * matches: fobar fooar fobaz fooaz
+ *
+ * pattern: foo\\+
+ * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
+ * matches: foo\ foo\\ foo\\\ ...
+ *
+ * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
+ * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
+ * matches: foo1foo foo2foo foo3foo
+ *
+ * pattern: \(fo.*\)-\1
+ * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
+ * matches: foo-foo fo-fo fob-fob foobar-foobar ...
+ *
+ */
+
+ #define MAXNFA 1024
+ #define MAXTAG 10
+
+ #define OKP 1
+ #define NOP 0
+
+ #define CHR 1
+ #define ANY 2
+ #define CCL 3
+ #define BOL 4
+ #define EOL 5
+ #define BOT 6
+ #define EOT 7
+ #define BOW 8
+ #define EOW 9
+ #define REF 10
+ #define CLO 11
+
+ #define END 0
+
+ /*
+ * The following defines are not meant
+ * to be changeable. They are for readability
+ * only.
+ *
+ */
+ #define MAXCHR 128
+ #define CHRBIT 8
+ #define BITBLK MAXCHR/CHRBIT
+ #define BLKIND 0170
+ #define BITIND 07
+
+ #define ASCIIB 0177
+
+ typedef /*unsigned*/ char CHAR;
+
+ static int tagstk[MAXTAG]; /* subpat tag stack..*/
+ static CHAR nfa[MAXNFA]; /* automaton.. */
+ static int sta = NOP; /* status of lastpat */
+
+ static CHAR bittab[BITBLK]; /* bit table for CCL */
+ /* pre-set bits... */
+ static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
+
+ static void
+ chset(c) register CHAR c; { bittab[((c)&BLKIND)>>3] |= bitarr[(c)&BITIND]; }
+
+ #define badpat(x) return(*nfa = END, x)
+ #define store(x) *mp++ = x
+
+ char *
+ re_comp(pat)
+ char *pat;
+ {
+ register char *p; /* pattern pointer */
+ register CHAR *mp=nfa; /* nfa pointer */
+ register CHAR *lp; /* saved pointer.. */
+ register CHAR *sp=nfa; /* another one.. */
+
+ register int tagi = 0; /* tag stack index */
+ register int tagc = 1; /* actual tag count */
+
+ register int n;
+ register CHAR mask; /* xor mask -CCL/NCL */
+ int c1, c2;
+
+ if (!pat || !*pat)
+ if (sta)
+ return(0);
+ else
+ badpat("No previous regular expression");
+ sta = NOP;
+
+ for (p = pat; *p; p++) {
+ lp = mp;
+ switch(*p) {
+
+ case '.': /* match any char.. */
+ store(ANY);
+ break;
+
+ case '^': /* match beginning.. */
+ if (p == pat)
+ store(BOL);
+ else {
+ store(CHR);
+ store(*p);
+ }
+ break;
+
+ case '$': /* match endofline.. */
+ if (!*(p+1))
+ store(EOL);
+ else {
+ store(CHR);
+ store(*p);
+ }
+ break;
+
+ case '[': /* match char class..*/
+ store(CCL);
+
+ if (*++p == '^') {
+ mask = 0377;
+ p++;
+ }
+ else
+ mask = 0;
+
+ if (*p == '-') /* real dash */
+ chset(*p++);
+ if (*p == ']') /* real brac */
+ chset(*p++);
+ while (*p && *p != ']') {
+ if (*p == '-' && *(p+1) && *(p+1) != ']') {
+ p++;
+ c1 = *(p-2) + 1;
+ c2 = *p++;
+ while (c1 <= c2)
+ chset(c1++);
+ }
+ #ifdef EXTEND
+ else if (*p == '\\' && *(p+1)) {
+ p++;
+ chset(*p++);
+ }
+ #endif
+ else
+ chset(*p++);
+ }
+ if (!*p)
+ badpat("Missing ]");
+
+ for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
+ store(mask ^ bittab[n]);
+
+ break;
+
+ case '*': /* match 0 or more.. */
+ case '+': /* match 1 or more.. */
+ if (p == pat)
+ badpat("Empty closure");
+ lp = sp; /* previous opcode */
+ if (*lp == CLO) /* equivalence.. */
+ break;
+ switch(*lp) {
+
+ case BOL:
+ case BOT:
+ case EOT:
+ case BOW:
+ case EOW:
+ case REF:
+ badpat("Illegal closure");
+ default:
+ break;
+ }
+
+ if (*p == '+')
+ for (sp = mp; lp < sp; lp++)
+ store(*lp);
+
+ store(END);
+ store(END);
+ sp = mp;
+ while (--mp > lp)
+ *mp = mp[-1];
+ store(CLO);
+ mp = sp;
+ break;
+
+ case '\\': /* tags, backrefs .. */
+ switch(*++p) {
+
+ case '(':
+ if (tagc < MAXTAG) {
+ tagstk[++tagi] = tagc;
+ store(BOT);
+ store(tagc++);
+ }
+ else
+ badpat("Too many \\(\\) pairs");
+ break;
+ case ')':
+ if (*sp == BOT)
+ badpat("Null pattern inside \\(\\)");
+ if (tagi > 0) {
+ store(EOT);
+ store(tagstk[tagi--]);
+ }
+ else
+ badpat("Unmatched \\)");
+ break;
+ case '<':
+ store(BOW);
+ break;
+ case '>':
+ if (*sp == BOW)
+ badpat("Null pattern inside \\<\\>");
+ store(EOW);
+ break;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ n = *p-'0';
+ if (tagi > 0 && tagstk[tagi] == n)
+ badpat("Cyclical reference");
+ if (tagc > n) {
+ store(REF);
+ store(n);
+ }
+ else
+ badpat("Undetermined reference");
+ break;
+ #ifdef EXTEND
+ case 'b':
+ store(CHR);
+ store('\b');
+ break;
+ case 'n':
+ store(CHR);
+ store('\n');
+ break;
+ case 'f':
+ store(CHR);
+ store('\f');
+ break;
+ case 'r':
+ store(CHR);
+ store('\r');
+ break;
+ case 't':
+ store(CHR);
+ store('\t');
+ break;
+ #endif
+ default:
+ store(CHR);
+ store(*p);
+ }
+ break;
+
+ default : /* an ordinary char */
+ store(CHR);
+ store(*p);
+ break;
+ }
+ sp = lp;
+ }
+ if (tagi > 0)
+ badpat("Unmatched \\(");
+ store(END);
+ sta = OKP;
+ return(0);
+ }
+
+
+ static char *bol;
+ char *bopat[MAXTAG];
+ char *eopat[MAXTAG];
+ char *pmatch();
+
+ /*
+ * re_exec:
+ * execute nfa to find a match.
+ *
+ * special cases: (nfa[0])
+ * BOL
+ * Match only once, starting from the
+ * beginning.
+ * CHR
+ * First locate the character without
+ * calling pmatch, and if found, call
+ * pmatch for the remaining string.
+ * END
+ * re_comp failed, poor luser did not
+ * check for it. Fail fast.
+ *
+ * If a match is found, bopat[0] and eopat[0] are set
+ * to the beginning and the end of the matched fragment,
+ * respectively.
+ *
+ */
+
+ int
+ re_exec(lp)
+ register char *lp;
+ {
+ register char c;
+ register char *ep = 0;
+ register CHAR *ap = nfa;
+
+ bol = lp;
+
+ bopat[0] = 0;
+ bopat[1] = 0;
+ bopat[2] = 0;
+ bopat[3] = 0;
+ bopat[4] = 0;
+ bopat[5] = 0;
+ bopat[6] = 0;
+ bopat[7] = 0;
+ bopat[8] = 0;
+ bopat[9] = 0;
+
+ switch(*ap) {
+
+ case BOL: /* anchored: match from BOL only */
+ ep = pmatch(lp,ap);
+ break;
+ case CHR: /* ordinary char: locate it fast */
+ c = *(ap+1);
+ while (*lp && *lp != c)
+ lp++;
+ if (!*lp) /* if EOS, fail, else fall thru. */
+ return(0);
+ default: /* regular matching all the way. */
+ while (*lp) {
+ if ((ep = pmatch(lp,ap)))
+ break;
+ lp++;
+ }
+ break;
+ case END: /* munged automaton. fail always */
+ return(0);
+ }
+ if (!ep)
+ return(0);
+
+ bopat[0] = lp;
+ eopat[0] = ep;
+ return(1);
+ }
+
+ /*
+ * pmatch:
+ * internal routine for the hard part
+ *
+ * This code is mostly snarfed from an early
+ * grep written by David Conroy. The backref and
+ * tag stuff, and various other mods are by oZ.
+ *
+ * special cases: (nfa[n], nfa[n+1])
+ * CLO ANY
+ * We KNOW ".*" will match ANYTHING
+ * upto the end of line. Thus, go to
+ * the end of line straight, without
+ * calling pmatch recursively. As in
+ * the other closure cases, the remaining
+ * pattern must be matched by moving
+ * backwards on the string recursively,
+ * to find a match for xy (x is ".*" and
+ * y is the remaining pattern) where
+ * the match satisfies the LONGEST match
+ * for x followed by a match for y.
+ * CLO CHR
+ * We can again scan the string forward
+ * for the single char without recursion,
+ * and at the point of failure, we execute
+ * the remaining nfa recursively, as
+ * described above.
+ *
+ * At the end of a successful match, bopat[n] and eopat[n]
+ * are set to the beginning and end of subpatterns matched
+ * by tagged expressions (n = 1 to 9).
+ *
+ */
+
+ extern void re_fail();
+
+ /*
+ * character classification table for word boundary
+ * operators BOW and EOW. the reason for not using
+ * ctype macros is that we can let the user add into
+ * our own table. see re_modw. This table is not in
+ * the bitset form, since we may wish to extend it
+ * in the future for other character classifications.
+ *
+ * TRUE for 0-9 A-Z a-z _
+ */
+ static char chrtyp[MAXCHR] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0, 0, 0, 0, 0
+ };
+
+ #define inascii(x) (0177&(x))
+ #define iswordc(x) chrtyp[inascii(x)]
+ #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
+
+ /*
+ * skip values for CLO XXX to skip past the closure
+ *
+ */
+
+ #define ANYSKIP 2 /* [CLO] ANY END ... */
+ #define CHRSKIP 3 /* [CLO] CHR chr END ... */
+ #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
+
+ static char *
+ pmatch(lp, ap)
+ register char *lp;
+ register CHAR *ap;
+ {
+ register int op, c, n;
+ register char *e; /* extra pointer for CLO */
+ register char *bp; /* beginning of subpat.. */
+ register char *ep; /* ending of subpat.. */
+ char *are; /* to save the line ptr. */
+
+ while ((op = *ap++) != END)
+ switch(op) {
+
+ case CHR:
+ if (*lp++ != *ap++)
+ return(0);
+ break;
+ case ANY:
+ if (!*lp++)
+ return(0);
+ break;
+ case CCL:
+ c = *lp++;
+ if (!isinset(ap,c))
+ return(0);
+ ap += BITBLK;
+ break;
+ case BOL:
+ if (lp != bol)
+ return(0);
+ break;
+ case EOL:
+ if (*lp)
+ return(0);
+ break;
+ case BOT:
+ bopat[*ap++] = lp;
+ break;
+ case EOT:
+ eopat[*ap++] = lp;
+ break;
+ case BOW:
+ if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
+ return(0);
+ break;
+ case EOW:
+ if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
+ return(0);
+ break;
+ case REF:
+ n = *ap++;
+ bp = bopat[n];
+ ep = eopat[n];
+ while (bp < ep)
+ if (*bp++ != *lp++)
+ return(0);
+ break;
+ case CLO:
+ are = lp;
+ switch(*ap) {
+
+ case ANY:
+ while (*lp)
+ lp++;
+ n = ANYSKIP;
+ break;
+ case CHR:
+ c = *(ap+1);
+ while (*lp && c == *lp)
+ lp++;
+ n = CHRSKIP;
+ break;
+ case CCL:
+ while ((c = *lp) && isinset(ap+1,c))
+ lp++;
+ n = CCLSKIP;
+ break;
+ default:
+ re_fail("closure: bad nfa.", *ap);
+ return(0);
+ }
+
+ ap += n;
+
+ while (lp >= are) {
+ if (e = pmatch(lp, ap))
+ return(e);
+ --lp;
+ }
+ return(0);
+ default:
+ re_fail("re_exec: bad nfa.", op);
+ return(0);
+ }
+ return(lp);
+ }
+
+ /*
+ * re_modw:
+ * add new characters into the word table to
+ * change the re_exec's understanding of what
+ * a word should look like. Note that we only
+ * accept additions into the word definition.
+ *
+ * If the string parameter is 0 or null string,
+ * the table is reset back to the default, which
+ * contains A-Z a-z 0-9 _. [We use the compact
+ * bitset representation for the default table]
+ *
+ */
+
+ static char deftab[16] = {
+ 0, 0, 0, 0, 0, 0, 377, 003, 376, 377, 377, 207,
+ 376, 377, 377, 007
+ };
+
+ void
+ re_modw(s)
+ register char *s;
+ {
+ register int i;
+
+ if (!s || !*s) {
+ for (i = 0; i < MAXCHR; i++)
+ if (!isinset(deftab,i))
+ iswordc(i) = 0;
+ }
+ else
+ while(*s)
+ iswordc(*s++) = 1;
+ }
+
+ /*
+ * re_subs:
+ * substitute the matched portions of the src in
+ * dst.
+ *
+ * & substitute the entire matched pattern.
+ *
+ * \digit substitute a subpattern, with the given
+ * tag number. Tags are numbered from 1 to
+ * 9. If the particular tagged subpattern
+ * does not exist, null is substituted.
+ *
+ */
+ int
+ re_subs(src, dst)
+ register char *src;
+ register char *dst;
+ {
+ register char c;
+ register int pin;
+ register char *bp;
+ register char *ep;
+
+ if (!*src || !bopat[0])
+ return(0);
+
+ while (c = *src++) {
+ switch(c) {
+
+ case '&':
+ pin = 0;
+ break;
+
+ case '\\':
+ c = *src++;
+ if (c >= '0' && c <= '9') {
+ pin = c - '0';
+ break;
+ }
+
+ default:
+ *dst++ = c;
+ continue;
+ }
+
+ if ((bp = bopat[pin]) && (ep = eopat[pin])) {
+ while (*bp && bp < ep)
+ *dst++ = *bp++;
+ if (bp < ep)
+ return(0);
+ }
+ }
+ *dst = (char) 0;
+ return(1);
+ }
+
+ #ifdef DEBUG
+ /*
+ * symbolic - produce a symbolic dump of the
+ * nfa
+ */
+ symbolic(s)
+ char *s;
+ {
+ printf("pattern: %s\n", s);
+ printf("nfacode:\n");
+ nfadump(nfa);
+ }
+
+ static
+ nfadump(ap)
+ CHAR *ap;
+ {
+ register int n;
+
+ while (*ap != END)
+ switch(*ap++) {
+ case CLO:
+ printf("CLOSURE");
+ nfadump(ap);
+ switch(*ap) {
+ case CHR:
+ n = CHRSKIP;
+ break;
+ case ANY:
+ n = ANYSKIP;
+ break;
+ case CCL:
+ n = CCLSKIP;
+ break;
+ }
+ ap += n;
+ break;
+ case CHR:
+ printf("\tCHR %c\n",*ap++);
+ break;
+ case ANY:
+ printf("\tANY .\n");
+ break;
+ case BOL:
+ printf("\tBOL -\n");
+ break;
+ case EOL:
+ printf("\tEOL -\n");
+ break;
+ case BOT:
+ printf("BOT: %d\n",*ap++);
+ break;
+ case EOT:
+ printf("EOT: %d\n",*ap++);
+ break;
+ case BOW:
+ printf("BOW\n");
+ break;
+ case EOW:
+ printf("EOW\n");
+ break;
+ case REF:
+ printf("REF: %d\n",*ap++);
+ break;
+ case CCL:
+ printf("\tCCL [");
+ for (n = 0; n < MAXCHR; n++)
+ if (isinset(ap,(CHAR)n)) {
+ if (n < ' ')
+ printf("^%c", n ^ 0x040);
+ else
+ printf("%c", n);
+ }
+ printf("]\n");
+ ap += BITBLK;
+ break;
+ default:
+ printf("bad nfa. opcode %o\n", ap[-1]);
+ exit(1);
+ break;
+ }
+ }
+ #endif
--
--
Kim Andersen @ Dansk Data Elektronik A/S, Herlev, Denmark.
E-mail: kim@dde.dk or ...!uunet!mcsun!dkuug!dde!kim